author_crawl.py 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151
  1. # _*_coding:utf-8_*_
  2. from hashlib import new
  3. import json
  4. import os
  5. import sys
  6. import time
  7. from multiprocessing import Process
  8. import time
  9. from datetime import datetime, timedelta
  10. import os
  11. import pickle
  12. import calendar
  13. import re
  14. from copy import deepcopy
  15. import requests
  16. import csv
  17. from commandline_config import Config
  18. from service_invoke import invokeService
  19. class TimeUtil(object):
  20. @classmethod
  21. def parse_timezone(cls, timezone):
  22. """
  23. 解析时区表示
  24. :param timezone: str eg: +8
  25. :return: dict{symbol, offset}
  26. """
  27. result = re.match(r'(?P<symbol>[+-])(?P<offset>\d+)', timezone)
  28. symbol = result.groupdict()['symbol']
  29. offset = int(result.groupdict()['offset'])
  30. return {
  31. 'symbol': symbol,
  32. 'offset': offset
  33. }
  34. @classmethod
  35. def convert_timezone(cls, dt, timezone="+0"):
  36. """默认是utc时间,需要"""
  37. result = cls.parse_timezone(timezone)
  38. symbol = result['symbol']
  39. offset = result['offset']
  40. if symbol == '+':
  41. return dt + timedelta(hours=offset)
  42. elif symbol == '-':
  43. return dt - timedelta(hours=offset)
  44. else:
  45. raise Exception('dont parse timezone format')
  46. def generate_timestamp():
  47. current_GMT = time.gmtime()
  48. # ts stores timestamp
  49. ts = calendar.timegm(current_GMT)
  50. current_time = datetime.utcnow()
  51. convert_now = TimeUtil.convert_timezone(current_time, '+8')
  52. print("current_time: " + str(convert_now))
  53. return str(convert_now)
  54. def main():
  55. # result = os.popen('python ServiceWrapper_ExecuteStage.py 38')
  56. # res = result.read()
  57. # for line in res.splitlines():
  58. # print("\n\n\n\nfinename:\n\n\n\n\n", line)
  59. config = {
  60. "pages": 5,
  61. "test": False,
  62. "test_pages": 3,
  63. }
  64. c = Config(config)
  65. print(c)
  66. csv_reader = csv.reader(open("./关键词.csv", encoding='utf-8'))
  67. keywords = []
  68. i = 0
  69. for line in csv_reader:
  70. if i < c.test_pages:
  71. print(line)
  72. i += 1
  73. keywords.append(line[0])
  74. urlList = ""
  75. i = 0
  76. for keyword in keywords:
  77. url = "https://so.toutiao.com/search?dvpf=pc&source=pagination&filter_vendor=site&keyword=%s&pd=synthesis&filter_vendor=site&action_type=pagination&page_num=0\r\n" % keyword
  78. # print(url)
  79. urlList += url
  80. i += 1
  81. if c.test and i > c.test_pages:
  82. break
  83. print(urlList)
  84. # result = requests.post(
  85. # "http://servicewrapper.naibo.wang/backEnd/invokeService",
  86. # data={"id": 6, # serviceID
  87. # "params": json.dumps({"loopTimes_Loop_Click_1": c.pages,
  88. # "urlList_0": urlList,
  89. # }),
  90. # })
  91. # authorTaskID = int(result.text)
  92. authorTaskID = invokeService(
  93. 0, {"loopTimes_Loop_Click_1": c.pages, "urlList_0": urlList})
  94. print("authorTaskID: " + str(authorTaskID))
  95. # exit(0)
  96. filename = generate_timestamp().replace(" ", "").replace(":", "-")
  97. print("filename:", filename)
  98. command = 'python ServiceWrapper_ExecuteStage_local.py ' + \
  99. str(authorTaskID) + ' ' + filename
  100. result = os.system(command)
  101. # authorTaskID = 53
  102. file_name = "task_" + str(authorTaskID) + "_" + filename + ".csv"
  103. # file_name = "task_53_2022-10-1723-35-40.881448.csv"
  104. print("file_name:", file_name)
  105. csv_reader = csv.reader(
  106. open("./Data/"+file_name, encoding='utf-8')) # taskID
  107. new_author_list = []
  108. i = 0
  109. for line in csv_reader:
  110. # print(line)
  111. if i > 0:
  112. new_author_list.append(line[0])
  113. i += 1
  114. # print(new_author_list)
  115. new_author_list = list(set(new_author_list)) # 去重
  116. csv_reader = csv.reader(open("./author_list.csv", encoding='utf-8'))
  117. author_list = []
  118. for line in csv_reader:
  119. author_list.append(line[0])
  120. author_list = list(set(author_list)) # 去重
  121. print("author_list:", author_list)
  122. print("new_author_list:", new_author_list)
  123. real_new_author_list = list(
  124. set(new_author_list).difference(set(author_list)))
  125. print("real_new_author_list:", real_new_author_list)
  126. with open("author_list.csv", "a", encoding='utf-8', newline='') as csvfile:
  127. writer = csv.writer(csvfile)
  128. for row in real_new_author_list:
  129. writer.writerow([row])
  130. if __name__ == '__main__':
  131. main()