# desc_crawl.py
  1. # _*_coding:utf-8_*_
  2. from hashlib import new
  3. import json
  4. import os
  5. import sys
  6. import time
  7. from multiprocessing import Process
  8. import time
  9. from datetime import datetime, timedelta
  10. import os
  11. import pickle
  12. import calendar
  13. import re
  14. from copy import deepcopy
  15. import requests
  16. import csv
  17. from commandline_config import Config
  18. from service_invoke import invokeService
  19. class TimeUtil(object):
  20. @classmethod
  21. def parse_timezone(cls, timezone):
  22. """
  23. 解析时区表示
  24. :param timezone: str eg: +8
  25. :return: dict{symbol, offset}
  26. """
  27. result = re.match(r'(?P<symbol>[+-])(?P<offset>\d+)', timezone)
  28. symbol = result.groupdict()['symbol']
  29. offset = int(result.groupdict()['offset'])
  30. return {
  31. 'symbol': symbol,
  32. 'offset': offset
  33. }
  34. @classmethod
  35. def convert_timezone(cls, dt, timezone="+0"):
  36. """默认是utc时间,需要"""
  37. result = cls.parse_timezone(timezone)
  38. symbol = result['symbol']
  39. offset = result['offset']
  40. if symbol == '+':
  41. return dt + timedelta(hours=offset)
  42. elif symbol == '-':
  43. return dt - timedelta(hours=offset)
  44. else:
  45. raise Exception('dont parse timezone format')
  46. def generate_timestamp():
  47. current_GMT = time.gmtime()
  48. # ts stores timestamp
  49. ts = calendar.timegm(current_GMT)
  50. current_time = datetime.utcnow()
  51. convert_now = TimeUtil.convert_timezone(current_time, '+8')
  52. print("current_time: " + str(convert_now))
  53. return str(convert_now)
  54. def main():
  55. # result = os.popen('python ServiceWrapper_ExecuteStage.py 38')
  56. # res = result.read()
  57. # for line in res.splitlines():
  58. # print("\n\n\n\nfinename:\n\n\n\n\n", line)
  59. config = {
  60. "pages": 5,
  61. "test": False,
  62. "test_pages": 3,
  63. }
  64. c = Config(config)
  65. print(c)
  66. csv_reader = csv.reader(open("./raw_data.csv", encoding='utf-8'))
  67. author_list = []
  68. for line in csv_reader:
  69. author_list.append(line[4])
  70. csv_reader = csv.reader(open("./author_list.csv", encoding='utf-8'))
  71. keywords = []
  72. i = 0
  73. for line in csv_reader:
  74. if line[0] not in author_list:
  75. keywords.append(line[0])
  76. else:
  77. print("Will not append keyword %s", line[0])
  78. i += 1
  79. if c.test and i > c.test_pages * 100:
  80. break
  81. # print("author_list:", author_list)
  82. # exit(0)
  83. urlList = ""
  84. i = 0
  85. for keyword in keywords:
  86. url = "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=%s&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\n" % keyword
  87. # print(url)
  88. urlList += url
  89. i += 1
  90. if c.test and i > c.test_pages:
  91. break
  92. print(urlList)
  93. # exit(0)
  94. # result = requests.post(
  95. # "http://servicewrapper.naibo.wang/backEnd/invokeService",
  96. # data={"id": 7, # serviceID
  97. # "params": json.dumps({"urlList_0": urlList,
  98. # }),
  99. # })
  100. # descTaskID = int(result.text)
  101. descTaskID = invokeService(
  102. 1, {"urlList_0": urlList})
  103. print("descTaskID: " + str(descTaskID))
  104. # exit(0)
  105. filename = generate_timestamp().replace(" ", "").replace(":", "-")
  106. print("filename:", filename)
  107. command = 'python ServiceWrapper_ExecuteStage_local.py ' + \
  108. str(descTaskID) + ' ' + filename
  109. result = os.system(command)
  110. # authorTaskID = 53
  111. file_name = "task_" + str(descTaskID) + "_" + filename + ".csv"
  112. # file_name = "task_53_2022-10-1723-35-40.881448.csv"
  113. print("file_name:", file_name)
  114. csv_reader = csv.reader(
  115. open("./Data/"+file_name, encoding='utf-8')) # taskID
  116. new_descTaskID = []
  117. i = 0
  118. for line in csv_reader:
  119. # print(line)
  120. if i > 0:
  121. new_descTaskID.append(line)
  122. i += 1
  123. # print(new_author_list)
  124. # new_descTaskID = list(set([tuple(t) for t in new_descTaskID]))
  125. # new_descTaskID = list(set(new_descTaskID)) # 去重
  126. after_remove_duplicate = []
  127. for i in range(len(new_descTaskID)):
  128. try:
  129. if i > 0:
  130. if new_descTaskID[i][2] == new_descTaskID[i-1][2]:
  131. continue
  132. if new_descTaskID[i][2] != "":
  133. zan = new_descTaskID[i][1].split("获赞")[0]
  134. fans = new_descTaskID[i][1].split("粉丝")[0].split("获赞")[1]
  135. follow = new_descTaskID[i][1].split("关注")[0].split("粉丝")[1]
  136. after_remove_duplicate.append(
  137. [new_descTaskID[i][0], zan, fans, follow, new_descTaskID[i][2], new_descTaskID[i][3]])
  138. except:
  139. pass
  140. print("after_remove_duplicate", after_remove_duplicate)
  141. all_collected = []
  142. for author in after_remove_duplicate:
  143. all_collected.append(author[4])
  144. print("all_collected:", all_collected)
  145. for keyword in keywords:
  146. if keyword not in all_collected:
  147. print("keyword not collected:", keyword)
  148. after_remove_duplicate.append(['', '', '', '', keyword, ''])
  149. new_descTaskID = after_remove_duplicate
  150. print("new_descTaskID:", new_descTaskID)
  151. # for i in range(len(keywords)):
  152. # author_list[i] = [keywords[i]].extend(new_descTaskID[i])
  153. # for row in author_list:
  154. # print(row)
  155. with open("raw_data.csv", "a", encoding='utf-8', newline='') as csvfile:
  156. writer = csv.writer(csvfile)
  157. for row in new_descTaskID:
  158. writer.writerow(row)
  159. import xlwt
  160. csv_reader = csv.reader(open("./raw_data.csv", encoding='utf-8'))
  161. all_data = []
  162. for line in csv_reader:
  163. all_data.append(line)
  164. workbook = xlwt.Workbook()
  165. sheet = workbook.add_sheet("Sheet")
  166. for i in range(len(all_data)):
  167. for j in range(len(all_data[i])):
  168. sheet.write(i, j, all_data[i][j])
  169. workbook.save("all_data.xls")
# Script entry point: run the full crawl/parse/export pipeline.
if __name__ == "__main__":
    main()