Coding makes me happy!!!
I don't know why either. When I come across something I love, I just want to keep doing it. I can go a month without leaving the house, but I can't go a day without my computer.
Where I stand: I now have a much clearer picture of Python. Maybe ops automation can be done with Python too. Keep going!
What it does:
Crawls the relevant web pages and saves the results both to a local file and to the DB.
Local file:
DB:
Overall flow:
1. Read the configuration file

def ReadLocalFiles():
    # Dictionary that will hold each variable=value pair from the conf file
    returndict = {}

    # Line counter, used in error messages
    linenumber = 0

    # Open the file read-only; the script runs from crontab, so the full path is required
    localfiles = open(r'/root/python-script/geturl/main.conf')

    # Read the whole file into a list of lines
    readfilelines = localfiles.readlines()

    # Walk the list and pick out the values
    for line in readfilelines:

        # Bump the line counter for error reporting
        linenumber = linenumber + 1

        # Strip trailing whitespace with rstrip
        line = line.rstrip()

        # Skip comment lines that start with #
        if line.startswith('#'):
            continue

        # Skip empty lines
        elif len(line) == 0:
            continue

        # Otherwise try to parse the line
        else:
            # Keep any parsing error under control
            try:
                returndict[line.split('=')[0]] = line.split('=')[1]

            # On error, report the offending line and bail out
            except:
                print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + " line %d %s error, please check" % (linenumber, line))
                localfiles.close()
                # Exit with a non-zero status
                sys.exit(-1)

    # Close the file and hand back the dictionary
    localfiles.close()
    return returndict
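For reference, the parser above just wants key=value lines. A minimal main.conf might look like this (the key names are inferred from the function parameters used later in this post; the values are made up):

# target site; SaveLocalUrl() rewrites this line when the site moves
url=http://www.example.com
# number of listing pages to walk per run
down_page=2
# local file that collects the ed2k links
update_file=/root/python-script/geturl/downfiles
# switches read by the complete listing at the end
enablewritefile=1
enabledb=1

One thing to watch: line.split('=')[1] keeps only the text between the first and second '=', so a value that itself contains '=' would be silently truncated; line.split('=', 1)[1] would keep the whole value.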
2. Clear out the previously downloaded file

def DeleteHisFiles(update_file):

    # Check whether update_file already exists
    if os.path.isfile(update_file):

        try:
            # Open the file read-write
            download_files = open(update_file, 'r+')
            # Truncate its contents
            download_files.truncate()
            # Close the file
            download_files.close()
        except:
            # Report that the truncate failed
            print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + " Truncate " + update_file + " error, please check it")
    # The file does not exist in that path yet
    else:
        # It will be created on the first write; just log that a fresh run starts
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + " Build new downfiles ok")
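Since 'w' mode both truncates an existing file and creates a missing one, the same behavior fits in a single line; a minimal alternative sketch:

def DeleteHisFiles(update_file):
    # 'w' truncates the file if it exists and creates it if it does not,
    # which collapses both branches of the version above
    open(update_file, 'w').close()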
3. Check whether the site has been updated

def DealwithURL(url):
    # Fetch the page content
    r = requests.get(url)

    # Regex that decides whether the site has moved; the pattern contained
    # HTML markup, so the blog engine stripped it, and the rest of the
    # function went with it
    pattern = re.compile('
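The pattern and everything after it were lost when the post was rendered. Judging from how gettrueurl() in the complete listing consumes the return value (True means "the URL is still good", anything else is fed to GetNewURL()), the lost body plausibly looked like this sketch; the pattern and the branch details are guesses, not the original code:

import re
import requests

def DealwithURL(url):
    r = requests.get(url)

    # hypothetical stand-in for the lost pattern, which matched the
    # "we have moved" redirect embedded in the page
    pattern = re.compile('window.location.href="(.*)"')

    findurl = re.findall(pattern, r.text)
    if len(findurl) == 0:
        return True          # no redirect found: the URL is still valid
    else:
        return findurl[0]    # hand the notice URL to GetNewURL()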
4. Get the updated URL

def GetNewURL(url):
    # Fetch the notice page
    r = requests.get(url)

    # Decode the response as UTF-8
    r.encoding = 'utf-8'

    # Regex for everything between "alert" and '">'
    pattern = re.compile('alert(.*)">')
    # Find every match in the page body
    findurl = re.findall(pattern, r.text)
    # Collapse the result list into a single string
    findurl_str = (" ".join(findurl))
    # 1. split on the first space and keep the first field
    # 2. drop the first two characters, the leftover "('"
    return (findurl_str.split(' ', 1)[0][2:])
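The slicing at the end is easier to see with a concrete value. Assuming the notice page announces the new address with a JavaScript alert (the exact markup here is a guess), the steps play out like this:

>>> import re
>>> text = "alert('http://new.example.com is the new address');\">"
>>> findurl = re.findall(re.compile('alert(.*)">'), text)
>>> findurl
["('http://new.example.com is the new address');"]
>>> (" ".join(findurl)).split(' ', 1)[0]      # first space-separated field
"('http://new.example.com"
>>> (" ".join(findurl)).split(' ', 1)[0][2:]  # drop the leading "('"
'http://new.example.com'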
5. Save the new URL back into the local config file

def SaveLocalUrl(untreatedurl, treatedurl):
    # Nothing to do when the stored URL is already current
    if untreatedurl == treatedurl:
        pass

    else:
        # Keep any file error under control
        try:
            # Open the config file read-only
            fileconf = open(r'/root/python-script/geturl/main.conf', 'r')
            # Start with an empty buffer for the rewritten content
            rewritestr = ""

            # Walk the config file line by line
            for readline in fileconf:

                # If this line contains the stale URL
                if re.search(untreatedurl, readline):

                    # Replace the stale URL with the new one
                    readline = re.sub(untreatedurl, treatedurl, readline)
                    # Append the rewritten line to the buffer
                    rewritestr = rewritestr + readline
                else:
                    # Otherwise append the line unchanged
                    rewritestr = rewritestr + readline
            # Done reading; close the file
            fileconf.close()

            # Reopen the file write-only
            fileconf = open(r'/root/python-script/geturl/main.conf', 'w')
            # Write the rewritten content back
            fileconf.write(rewritestr)
            # Close the file
            fileconf.close()

        except:
            # Report the failure
            print("get new url but open files ng write to logs, please check main.conf")
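Two details worth noting: re.search/re.sub treat untreatedurl as a regular expression, so every '.' in the old URL matches any character, and the rewrite happens line by line. A literal, whole-file variant might look like this sketch (the function name is hypothetical):

def SaveLocalUrlSimple(untreatedurl, treatedurl, conf=r'/root/python-script/geturl/main.conf'):
    if untreatedurl == treatedurl:
        return
    # str.replace treats the old URL as a literal string rather than a regex
    with open(conf, 'r') as f:
        content = f.read()
    with open(conf, 'w') as f:
        f.write(content.replace(untreatedurl, treatedurl))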
6. Pick a random listing page and get the total page count

def GetTitleInfo(url, down_page, update_file, headers):

    # Pick one of the eight list categories at random
    title = '/list/' + str(random.randint(1, 8))

    # Build the listing URL as url + title + '.html'
    titleurl = url + title + '.html'

    # Fetch the listing page
    r = requests.get(titleurl)
    # Decode with the encoding chardet detects from the raw bytes
    r.encoding = chardet.detect(r.content)['encoding']
    # Regex for the total page count, e.g. " 当前:1/120页 " captures "120"
    pattern = re.compile(' 当前:.*/(.*)页 ')
    # Find the match in the page body
    getpagenumber = re.findall(pattern, r.text)

    # Collapse the result list into a string
    getpagenumber = (" ".join(getpagenumber))

    GetListURLinfo(url, title, int(getpagenumber), int(down_page), update_file, headers)
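chardet.detect() works on raw bytes (r.content), not on decoded text, and returns its best guess plus a confidence score. A quick sanity check, assuming the page came back GBK-encoded:

import chardet

raw = ' 当前:1/120页 '.encode('gbk')   # stand-in for r.content
print(chardet.detect(raw))
# e.g. {'encoding': 'GB2312', 'confidence': 0.99, 'language': 'Chinese'}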
7. Download the detail URLs

def GetListURLinfo(sourceurl, title, getpagenumber, total, update_file, headers):

    # Clamp total to a workable range
    if total >= 100:
        total = 100

    if total <= 1:
        total = 2

    # Use the clamped total as the page-number ceiling
    getpagenumber = total

    # Loop total times, one random page per pass
    for number in range(0, total):
        try:
            # Install the alarm handler
            signal.signal(signal.SIGALRM, handler)
            # Arm a 3-second alarm; a pass that takes longer gets an exception
            signal.alarm(3)

            # Pick a random page no higher than getpagenumber
            url = sourceurl + title + '-' + str(random.randint(1, getpagenumber)) + '.html'

            # Fetch the page
            r = requests.get(url)

            # Regex for the per-title links; the original pattern contained
            # HTML markup, which the blog engine stripped out
            pattern = re.compile('(.*)')
            # Decode with the encoding chardet detects from the raw bytes
            r.encoding = chardet.detect(r.content)['encoding']
            # Collect every matching link
            allurl = re.findall(pattern, r.text)

            # Walk the matched links
            for lineurl in allurl:
                try:
                    # Re-arm the 3-second alarm for each detail page
                    signal.signal(signal.SIGALRM, handler)
                    signal.alarm(3)

                    # The rest of this loop body (the regex that pulls out each
                    # title's URL and name, and the call into GetDownloadURL)
                    # was lost when the post was rendered
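The signal.alarm(3) calls are a Unix-only watchdog: handler() (defined in the complete listing below) turns SIGALRM into an exception, so a hung requests.get() lands in the surrounding except instead of blocking forever. A self-contained sketch of the pattern:

import signal
import time

def handler(signum, frame):
    # turn SIGALRM into an exception the try/except below can catch
    raise AssertionError

signal.signal(signal.SIGALRM, handler)
signal.alarm(3)              # deliver SIGALRM in 3 seconds
try:
    time.sleep(10)           # stands in for a slow requests.get()
    signal.alarm(0)          # finished in time: disarm the alarm
except AssertionError:
    print("request timed out")

requests can do this portably with requests.get(url, timeout=3), which raises requests.exceptions.Timeout and works without Unix signals.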
8. Get each title's ed2k link

def GetDownloadURL(sourceurl, titleurl, titlename, update_file, headers):
    # Collapse the list arguments into plain strings
    downurlstr = (" ".join(titleurl))
    downnamestr = (" ".join(titlename))

    # Fetch the detail page
    r = requests.get((sourceurl + downurlstr))

    # Regex for the text between 'autocomplete="on">' and '/'
    pattern = re.compile('autocomplete="on">(.*)/')

    # Pull out the ed2k link
    downurled2k = re.findall(pattern, r.text)
    # Collapse it into a string
    downurled2kstr = (" ".join(downurled2k))

    # NB: the original passed a third titlename argument here, which the
    # two-parameter definition in step 9 cannot accept
    WriteLocalDownloadURL(update_file, [downurled2kstr])

    # Show the title and its ed2k link
    print(downnamestr, downurled2kstr)

    # Record the source page URL alongside the link
    savedburl = sourceurl + downurlstr
    SaveDB(time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time())), titlename[0], downurled2kstr, savedburl)
    # print(time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time())))
9. Write the links to a local file

def WriteLocalDownloadURL(downfile, listfile):

    # Open downfile in append mode
    urlfile = open(downfile, 'a+')

    # Write each entry on its own line
    for line in listfile:
        urlfile.write(line + '\n')

    # Close the file
    urlfile.close()
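One subtlety: listfile is iterated element by element, so it must be a list (or another iterable of strings). Passing a bare string writes it character by character, which is why step 8 wraps the link in a list:

>>> for line in ["link1", "link2"]:
...     print(line)          # what the function writes: one link per line
link1
link2
>>> for line in "link1":
...     print(line)          # a bare string falls apart into characters
l
i
n
k
1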
10. Write the records into the DB

def SaveDB(nowdate, tilename, downurl, savedburl):

    # Get a cursor on the global pymysql connection
    cursor = db.cursor()

    # Check whether this link is already in the table
    sql = "select count(nurl) from downurl where nurl = '%s';" % (downurl)
    cursor.execute(sql)

    # Fetch the single count row
    data = cursor.fetchone()

    # Insert only when the count is '0'
    if '0' == str(data[0]):
        # Build the insert statement
        sql = "insert into downurl values ('%s','%s','%s','%s');" % (nowdate, tilename, savedburl, downurl)

        try:
            # Run the insert and commit it
            cursor.execute(sql)
            db.commit()
        except:
            # Roll back on any error
            db.rollback()
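Building SQL with the % operator breaks as soon as a title contains a quote, and it is an injection risk. pymysql can fill the placeholders itself; a drop-in sketch of the same body, using the same db handle:

def SaveDB(nowdate, tilename, downurl, savedburl):
    cursor = db.cursor()

    # pymysql quotes the parameters itself, so titles with quotes are safe
    cursor.execute("select count(nurl) from downurl where nurl = %s", (downurl,))

    if cursor.fetchone()[0] == 0:
        try:
            cursor.execute(
                "insert into downurl values (%s, %s, %s, %s)",
                (nowdate, tilename, savedburl, downurl),
            )
            db.commit()
        except:
            db.rollback()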
The complete code:

root@Linux:~/python-script/geturl # cat main.py
#!/usr/bin/python3
import requests
import re
import chardet
import random
import signal
import time
import os
import sys
import pymysql


def DealwithURL(url):
    r = requests.get(url)
    pattern = re.compile('
# ... (a block was lost here when the post was rendered: the rest of
# DealwithURL and the start of GetNewURL; the three lines below are the
# tail of GetNewURL, shown in full in step 4 above) ...
    findurl = re.findall(pattern, r.text)
    findurl_str = (" ".join(findurl))
    return (findurl_str.split(' ', 1)[0][2:])

def gettrueurl(url):
    if DealwithURL(url) == True:
        return url
    else:
        return GetNewURL(DealwithURL(url))

def SaveLocalUrl(untreatedurl, treatedurl):
    if untreatedurl == treatedurl:
        pass
    else:
        try:
            fileconf = open(r'/root/python-script/geturl/main.conf', 'r')
            rewritestr = ""

            for readline in fileconf:
                if re.search(untreatedurl, readline):
                    readline = re.sub(untreatedurl, treatedurl, readline)
                    rewritestr = rewritestr + readline
                else:
                    rewritestr = rewritestr + readline
            fileconf.close()

            fileconf = open(r'/root/python-script/geturl/main.conf', 'w')
            fileconf.write(rewritestr)
            fileconf.close()

        except:
            print("get new url but open files ng write to logs")

def handler(signum, frame):
    raise AssertionError

def SaveDB(nowdate, tilename, downurl, savedburl):
    cursor = db.cursor()

    sql = "select count(nurl) from downurl where nurl = '%s';" % (downurl)
    cursor.execute(sql)

    data = cursor.fetchone()

    if '0' == str(data[0]):
        sql = "insert into downurl values ('%s','%s','%s','%s');" % (nowdate, tilename, savedburl, downurl)

        try:
            cursor.execute(sql)
            db.commit()
        except:
            db.rollback()

def WriteLocalDownloadURL(downfile, listfile):

    urlfile = open(downfile, 'a+')

    for line in listfile:
        urlfile.write(line + '\n')

    urlfile.close()

def GetDownloadURL(sourceurl, titleurl, titlename, update_file, headers, enabledb):

    downurlstr = (" ".join(titleurl))
    downnamestr = (" ".join(titlename))

    r = requests.get((sourceurl + downurlstr))
    pattern = re.compile('autocomplete="on">(.*)/')

    downurled2k = re.findall(pattern, r.text)
    downurled2kstr = (" ".join(downurled2k))

    print(downnamestr, downurled2kstr)

    if 1 == enabledb:
        savedburl = sourceurl + downurlstr
        SaveDB(time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time())), titlename[0], downurled2kstr, savedburl)

    returnstr = titlename[0] + " " + downurled2kstr
    return returnstr

def ReadLocalFiles():
    returndict = {}

    linenumber = 0

    localfiles = open(r'/root/python-script/geturl/main.conf')

    readfilelines = localfiles.readlines()

    for line in readfilelines:
        linenumber = linenumber + 1

        line = line.rstrip()

        if line.startswith('#'):
            continue
        elif len(line) == 0:
            continue
        else:
            try:
                returndict[line.split('=')[0]] = line.split('=')[1]
            except:
                print("line %d %s error,please check" % (linenumber, line))
                sys.exit(-1)

    return returndict

def GetListURLinfo(sourceurl, title, getpagenumber, total, update_file, headers, enablewritefile, enabledb):

    returnwriteurl = []

    if total >= 100:
        total = 100

    if total <= 1:
        total = 2

    getpagenumber = total

    for number in range(0, total):
        try:
            signal.signal(signal.SIGALRM, handler)
            signal.alarm(3)

            url = sourceurl + title + '-' + str(random.randint(1, getpagenumber)) + '.html'

            r = requests.get(url)

            pattern = re.compile('(.*)')   # (HTML inside the pattern was stripped)
            r.encoding = chardet.detect(r.content)['encoding']
            allurl = re.findall(pattern, r.text)

            for lineurl in allurl:
                try:
                    signal.signal(signal.SIGALRM, handler)
                    signal.alarm(3)

                    pattern = re.compile('
# ... (the listing breaks off here in the original post; the remaining code,
# including GetTitleInfo, DeleteHisFiles and the __main__ entry point, was lost) ...
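The listing breaks off before the entry point, but steps 1-10 pin down the order of operations. A hypothetical reconstruction of the missing wiring (every name and connection parameter here is a guess, not the original code):

if __name__ == '__main__':
    # step 1: load the config
    mainconf = ReadLocalFiles()

    # steps 3-5: resolve the current site address and store it back
    trueurl = gettrueurl(mainconf['url'])
    SaveLocalUrl(mainconf['url'], trueurl)

    # step 2: start with an empty download file
    DeleteHisFiles(mainconf['update_file'])

    # connection parameters are placeholders, not the original's
    db = pymysql.connect(host='localhost', user='root',
                         password='secret', database='pythondb', charset='utf8')

    # steps 6-10: walk a random listing and collect the links
    GetTitleInfo(trueurl, int(mainconf['down_page']), mainconf['update_file'], headers=None)
    db.close()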
End