webget.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463
  1. #include <iostream>
  2. #include <unistd.h>
  3. #include <sys/stat.h>
  4. //#include <mutex>
  5. #include <thread>
  6. #include <atomic>
  7. #include <curl/curl.h>
  8. #include "webget.h"
  9. #include "version.h"
  10. #include "misc.h"
  11. #include "logger.h"
  12. #ifdef _WIN32
  13. #ifndef _stat
  14. #define _stat stat
  15. #endif // _stat
  16. #endif // _WIN32
  17. extern bool gPrintDbgInfo, gServeCacheOnFetchFail;
  18. extern int gLogLevel;
  19. /*
  20. typedef std::lock_guard<std::mutex> guarded_mutex;
  21. std::mutex cache_rw_lock;
  22. */
  /**
   * Spinning reader/writer lock built on atomics.
   *
   * m_lockCount holds the lock state: FREE_STATUS (0) when nobody holds the
   * lock, WRITE_LOCK_STATUS (-1) while a writer holds it, and a positive
   * number equal to the count of active readers otherwise.
   *
   * The thread that currently holds the write lock may call readLock(),
   * readUnlock() and writeLock() again; those calls are no-ops for it.
   *
   * When constructed with writeFirst = true, new readers also wait while any
   * writer is queued in writeLock().
   *
   * Every lock/unlock call returns the value of m_lockCount observed right
   * after the operation.
   */
  class RWLock
  {
  #define WRITE_LOCK_STATUS -1
  #define FREE_STATUS 0
  private:
      const std::thread::id NULL_THREAD;
      const bool WRITE_FIRST;
      // Atomic because every lock/unlock call on any thread reads this value
      // while the writer thread may be storing to it.
      std::atomic<std::thread::id> m_write_thread_id;
      std::atomic_int m_lockCount;
      std::atomic_uint m_writeWaitCount;

      /// True when the calling thread is the one holding the write lock.
      bool callerIsWriter() const
      {
          return std::this_thread::get_id() == m_write_thread_id.load();
      }
  public:
      RWLock(const RWLock&) = delete;
      RWLock& operator=(const RWLock&) = delete;
      RWLock(bool writeFirst = true): WRITE_FIRST(writeFirst), m_write_thread_id(std::thread::id()), m_lockCount(0), m_writeWaitCount(0) {}
      virtual ~RWLock() = default;

      /// Acquires a shared lock; spins while a writer holds the lock (and,
      /// in write-first mode, while a writer is waiting).
      int readLock()
      {
          if (!callerIsWriter())
          {
              int count;
              do {
                  while ((count = m_lockCount.load()) == WRITE_LOCK_STATUS
                         || (WRITE_FIRST && m_writeWaitCount.load() > 0))
                      std::this_thread::yield(); // give the lock holder a chance to run
              } while (!m_lockCount.compare_exchange_weak(count, count + 1));
          }
          return m_lockCount.load();
      }

      /// Releases a shared lock previously taken with readLock().
      int readUnlock()
      {
          if (!callerIsWriter())
              --m_lockCount;
          return m_lockCount.load();
      }

      /// Acquires the exclusive lock; spins until no reader or writer holds it.
      int writeLock()
      {
          if (!callerIsWriter())
          {
              ++m_writeWaitCount;
              int expected = FREE_STATUS;
              while (!m_lockCount.compare_exchange_weak(expected, WRITE_LOCK_STATUS))
              {
                  // A failed exchange overwrites `expected` with the current value.
                  expected = FREE_STATUS;
                  std::this_thread::yield();
              }
              --m_writeWaitCount;
              m_write_thread_id.store(std::this_thread::get_id());
          }
          return m_lockCount.load();
      }

      /// Releases the exclusive lock.
      /// @throws std::runtime_error if the caller is not the thread holding
      ///         the write lock, or if the internal state is inconsistent.
      int writeUnlock()
      {
          if (!callerIsWriter())
          {
              throw std::runtime_error("writeLock/Unlock mismatch");
          }
          if (WRITE_LOCK_STATUS != m_lockCount.load())
          {
              throw std::runtime_error("RWLock internal error");
          }
          // Clear the owner before freeing the lock so the next writer's
          // ownership record cannot be overwritten by this thread.
          m_write_thread_id.store(NULL_THREAD);
          m_lockCount.store(FREE_STATUS);
          return m_lockCount.load();
      }
  };
  86. RWLock cache_rw_lock;
  87. long gMaxAllowedDownloadSize = 1048576L;
  88. //std::string user_agent_str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36";
  89. std::string user_agent_str = "subconverter/" VERSION " cURL/" LIBCURL_VERSION;
  /// Per-transfer settings handed to size_checker() via CURLOPT_XFERINFODATA.
  struct curl_progress_data
  {
      /// Maximum number of bytes allowed for the download; 0 means no limit.
      long size_limit{0L};
  };
  94. static inline void curl_init()
  95. {
  96. static bool init = false;
  97. if(!init)
  98. {
  99. curl_global_init(CURL_GLOBAL_ALL);
  100. init = true;
  101. }
  102. }
  /**
   * libcurl write/header callback that appends the received bytes to a
   * std::string.
   *
   * The signature matches the prototype libcurl invokes,
   * size_t(char*, size_t, size_t, void*); the former `int` return type and
   * `std::string*` parameter did not.
   *
   * @param data        received bytes (not null-terminated)
   * @param size        size of one element
   * @param nmemb       number of elements
   * @param writerData  pointer to the destination std::string, or null
   * @return number of bytes consumed (size * nmemb), or 0 when writerData is
   *         null
   */
  static size_t writer(char *data, size_t size, size_t nmemb, void *writerData)
  {
      if(writerData == nullptr)
          return 0;
      const size_t total = size * nmemb;
      if(total)
          static_cast<std::string*>(writerData)->append(data, total);
      return total;
  }
  /**
   * libcurl write/header callback that discards everything it receives.
   *
   * Returns size_t, as the libcurl callback prototype requires, instead of
   * the former `int`.
   *
   * @return size * nmemb, i.e. it reports every byte as consumed
   */
  static size_t dummy_writer(char *data, size_t size, size_t nmemb, void *writerData)
  {
      /// dummy writer, do not save anything
      (void)data;
      (void)writerData;
      return size * nmemb;
  }
  117. static int size_checker(void *clientp, curl_off_t dltotal, curl_off_t dlnow, curl_off_t ultotal, curl_off_t ulnow)
  118. {
  119. if(clientp)
  120. {
  121. curl_progress_data *data = reinterpret_cast<curl_progress_data*>(clientp);
  122. if(data->size_limit)
  123. {
  124. if(dltotal > data->size_limit || dlnow > data->size_limit)
  125. return 1;
  126. }
  127. }
  128. return 0;
  129. }
  130. static inline void curl_set_common_options(CURL *curl_handle, const char *url, curl_progress_data *data)
  131. {
  132. curl_easy_setopt(curl_handle, CURLOPT_URL, url);
  133. curl_easy_setopt(curl_handle, CURLOPT_VERBOSE, gLogLevel == LOG_LEVEL_VERBOSE ? 1L : 0L);
  134. curl_easy_setopt(curl_handle, CURLOPT_NOPROGRESS, 0L);
  135. curl_easy_setopt(curl_handle, CURLOPT_NOSIGNAL, 1L);
  136. curl_easy_setopt(curl_handle, CURLOPT_FOLLOWLOCATION, 1L);
  137. curl_easy_setopt(curl_handle, CURLOPT_MAXREDIRS, 20L);
  138. curl_easy_setopt(curl_handle, CURLOPT_SSL_VERIFYPEER, 0L);
  139. curl_easy_setopt(curl_handle, CURLOPT_SSL_VERIFYHOST, 0L);
  140. curl_easy_setopt(curl_handle, CURLOPT_TIMEOUT, 15L);
  141. curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, user_agent_str.data());
  142. if(data)
  143. {
  144. if(data->size_limit)
  145. curl_easy_setopt(curl_handle, CURLOPT_MAXFILESIZE, data->size_limit);
  146. curl_easy_setopt(curl_handle, CURLOPT_XFERINFOFUNCTION, size_checker);
  147. curl_easy_setopt(curl_handle, CURLOPT_XFERINFODATA, data);
  148. }
  149. }
  150. //static std::string curlGet(const std::string &url, const std::string &proxy, std::string &response_headers, CURLcode &return_code, const string_map &request_headers)
  151. static int curlGet(const FetchArgument &argument, FetchResult &result)
  152. {
  153. CURL *curl_handle;
  154. std::string *data = result.content, new_url = argument.url;
  155. struct curl_slist *list = NULL;
  156. defer(curl_slist_free_all(list);)
  157. long retVal = 0;
  158. curl_init();
  159. curl_handle = curl_easy_init();
  160. if(argument.proxy.size())
  161. {
  162. if(startsWith(argument.proxy, "cors:"))
  163. {
  164. list = curl_slist_append(list, "X-Requested-With: subconverter " VERSION);
  165. new_url = argument.proxy.substr(5) + argument.url;
  166. }
  167. else
  168. curl_easy_setopt(curl_handle, CURLOPT_PROXY, argument.proxy.data());
  169. }
  170. curl_progress_data limit;
  171. limit.size_limit = gMaxAllowedDownloadSize;
  172. curl_set_common_options(curl_handle, new_url.data(), &limit);
  173. if(argument.request_headers)
  174. {
  175. for(auto &x : *argument.request_headers)
  176. list = curl_slist_append(list, (x.first + ": " + x.second).data());
  177. }
  178. list = curl_slist_append(list, "SubConverter-Request: 1");
  179. list = curl_slist_append(list, "SubConverter-Version: " VERSION);
  180. if(list)
  181. curl_easy_setopt(curl_handle, CURLOPT_HTTPHEADER, list);
  182. if(result.content)
  183. {
  184. curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, writer);
  185. curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, result.content);
  186. }
  187. else
  188. curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, dummy_writer);
  189. if(result.response_headers)
  190. {
  191. curl_easy_setopt(curl_handle, CURLOPT_HEADERFUNCTION, writer);
  192. curl_easy_setopt(curl_handle, CURLOPT_HEADERDATA, result.response_headers);
  193. }
  194. else
  195. curl_easy_setopt(curl_handle, CURLOPT_HEADERFUNCTION, dummy_writer);
  196. unsigned int fail_count = 0, max_fails = 1;
  197. while(true)
  198. {
  199. *result.status_code = curl_easy_perform(curl_handle);
  200. if(*result.status_code == CURLE_OK || max_fails >= fail_count)
  201. break;
  202. else
  203. fail_count++;
  204. }
  205. curl_easy_getinfo(curl_handle, CURLINFO_HTTP_CODE, &retVal);
  206. curl_easy_cleanup(curl_handle);
  207. if(data)
  208. {
  209. if(*result.status_code != CURLE_OK || retVal != 200)
  210. data->clear();
  211. data->shrink_to_fit();
  212. }
  213. return *result.status_code;
  214. }
  215. // data:[<mediatype>][;base64],<data>
  216. static std::string dataGet(const std::string &url)
  217. {
  218. if (!startsWith(url, "data:"))
  219. return std::string();
  220. std::string::size_type comma = url.find(',');
  221. if (comma == std::string::npos || comma == url.size() - 1)
  222. return std::string();
  223. std::string data = UrlDecode(url.substr(comma + 1));
  224. if (endsWith(url.substr(0, comma), ";base64")) {
  225. return urlsafe_base64_decode(data);
  226. } else {
  227. return data;
  228. }
  229. }
  /**
   * Builds a "socks5://[user:pass@]addr:port" proxy URL.
   *
   * The credentials part is emitted only when both username and password are
   * non-empty.
   *
   * @param addr      proxy host
   * @param port      proxy port
   * @param username  optional user name
   * @param password  optional password
   * @return the assembled proxy string
   */
  std::string buildSocks5ProxyString(const std::string &addr, int port, const std::string &username, const std::string &password)
  {
      std::string proxystr = "socks5://";
      if (!username.empty() && !password.empty())
      {
          proxystr += username;
          proxystr += ':';
          proxystr += password;
          proxystr += '@';
      }
      proxystr += addr;
      proxystr += ':';
      proxystr += std::to_string(port);
      return proxystr;
  }
  236. std::string webGet(const std::string &url, const std::string &proxy, unsigned int cache_ttl, std::string *response_headers, string_map *request_headers)
  237. {
  238. int return_code = 0;
  239. std::string content;
  240. FetchArgument argument {url, proxy, request_headers, cache_ttl};
  241. FetchResult fetch_res {&return_code, &content, response_headers};
  242. if (startsWith(url, "data:"))
  243. return dataGet(url);
  244. // cache system
  245. if(cache_ttl > 0)
  246. {
  247. md("cache");
  248. const std::string url_md5 = getMD5(url);
  249. const std::string path = "cache/" + url_md5, path_header = path + "_header";
  250. struct stat result;
  251. if(stat(path.data(), &result) == 0) // cache exist
  252. {
  253. time_t mtime = result.st_mtime, now = time(NULL); // get cache modified time and current time
  254. if(difftime(now, mtime) <= cache_ttl) // within TTL
  255. {
  256. writeLog(0, "CACHE HIT: '" + url + "', using local cache.");
  257. //guarded_mutex guard(cache_rw_lock);
  258. cache_rw_lock.readLock();
  259. defer(cache_rw_lock.readUnlock();)
  260. if(response_headers)
  261. *response_headers = fileGet(path_header, true);
  262. return fileGet(path, true);
  263. }
  264. writeLog(0, "CACHE MISS: '" + url + "', TTL timeout, creating new cache."); // out of TTL
  265. }
  266. else
  267. writeLog(0, "CACHE NOT EXIST: '" + url + "', creating new cache.");
  268. //content = curlGet(url, proxy, response_headers, return_code); // try to fetch data
  269. curlGet(argument, fetch_res);
  270. if(return_code == CURLE_OK) // success, save new cache
  271. {
  272. //guarded_mutex guard(cache_rw_lock);
  273. cache_rw_lock.writeLock();
  274. defer(cache_rw_lock.writeUnlock();)
  275. fileWrite(path, content, true);
  276. if(response_headers)
  277. fileWrite(path_header, *response_headers, true);
  278. }
  279. else
  280. {
  281. if(fileExist(path) && gServeCacheOnFetchFail) // failed, check if cache exist
  282. {
  283. writeLog(0, "Fetch failed. Serving cached content."); // cache exist, serving cache
  284. //guarded_mutex guard(cache_rw_lock);
  285. cache_rw_lock.readLock();
  286. defer(cache_rw_lock.readUnlock();)
  287. content = fileGet(path, true);
  288. if(response_headers)
  289. *response_headers = fileGet(path_header, true);
  290. }
  291. else
  292. writeLog(0, "Fetch failed. No local cache available."); // cache not exist or not allow to serve cache, serving nothing
  293. }
  294. return content;
  295. }
  296. //return curlGet(url, proxy, response_headers, return_code);
  297. curlGet(argument, fetch_res);
  298. return content;
  299. }
  300. void flushCache()
  301. {
  302. //guarded_mutex guard(cache_rw_lock);
  303. cache_rw_lock.writeLock();
  304. defer(cache_rw_lock.writeUnlock();)
  305. operateFiles("cache", [](std::string file){ remove(("cache/" + file).data()); return 0; });
  306. }
  307. int curlPost(const std::string &url, const std::string &data, const std::string &proxy, const string_array &request_headers, std::string *retData)
  308. {
  309. CURL *curl_handle;
  310. CURLcode res;
  311. struct curl_slist *list = NULL;
  312. long retVal = 0;
  313. curl_init();
  314. curl_handle = curl_easy_init();
  315. list = curl_slist_append(list, "Content-Type: application/json;charset='utf-8'");
  316. for(const std::string &x : request_headers)
  317. list = curl_slist_append(list, x.data());
  318. curl_progress_data limit;
  319. curl_set_common_options(curl_handle, url.data(), &limit);
  320. curl_easy_setopt(curl_handle, CURLOPT_POST, 1L);
  321. curl_easy_setopt(curl_handle, CURLOPT_POSTFIELDS, data.data());
  322. curl_easy_setopt(curl_handle, CURLOPT_POSTFIELDSIZE, data.size());
  323. curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, writer);
  324. curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, retData);
  325. curl_easy_setopt(curl_handle, CURLOPT_HTTPHEADER, list);
  326. if(proxy.size())
  327. curl_easy_setopt(curl_handle, CURLOPT_PROXY, proxy.data());
  328. res = curl_easy_perform(curl_handle);
  329. curl_slist_free_all(list);
  330. if(res == CURLE_OK)
  331. {
  332. curl_easy_getinfo(curl_handle, CURLINFO_HTTP_CODE, &retVal);
  333. }
  334. curl_easy_cleanup(curl_handle);
  335. return retVal;
  336. }
  337. int webPost(const std::string &url, const std::string &data, const std::string &proxy, const string_array &request_headers, std::string *retData)
  338. {
  339. return curlPost(url, data, proxy, request_headers, retData);
  340. }
  341. int curlPatch(const std::string &url, const std::string &data, const std::string &proxy, const string_array &request_headers, std::string *retData)
  342. {
  343. CURL *curl_handle;
  344. CURLcode res;
  345. long retVal = 0;
  346. struct curl_slist *list = NULL;
  347. curl_init();
  348. curl_handle = curl_easy_init();
  349. list = curl_slist_append(list, "Content-Type: application/json;charset='utf-8'");
  350. for(const std::string &x : request_headers)
  351. list = curl_slist_append(list, x.data());
  352. curl_progress_data limit;
  353. curl_set_common_options(curl_handle, url.data(), &limit);
  354. curl_easy_setopt(curl_handle, CURLOPT_CUSTOMREQUEST, "PATCH");
  355. curl_easy_setopt(curl_handle, CURLOPT_POSTFIELDS, data.data());
  356. curl_easy_setopt(curl_handle, CURLOPT_POSTFIELDSIZE, data.size());
  357. curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, writer);
  358. curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, retData);
  359. curl_easy_setopt(curl_handle, CURLOPT_HTTPHEADER, list);
  360. if(proxy.size())
  361. curl_easy_setopt(curl_handle, CURLOPT_PROXY, proxy.data());
  362. res = curl_easy_perform(curl_handle);
  363. curl_slist_free_all(list);
  364. if(res == CURLE_OK)
  365. {
  366. res = curl_easy_getinfo(curl_handle, CURLINFO_HTTP_CODE, &retVal);
  367. }
  368. curl_easy_cleanup(curl_handle);
  369. return retVal;
  370. }
  371. int webPatch(const std::string &url, const std::string &data, const std::string &proxy, const string_array &request_headers, std::string *retData)
  372. {
  373. return curlPatch(url, data, proxy, request_headers, retData);
  374. }
  375. int curlHead(const std::string &url, const std::string &proxy, const string_array &request_headers, std::string &response_headers)
  376. {
  377. CURL *curl_handle;
  378. CURLcode res;
  379. long retVal = 0;
  380. struct curl_slist *list = NULL;
  381. curl_init();
  382. curl_handle = curl_easy_init();
  383. list = curl_slist_append(list, "Content-Type: application/json;charset='utf-8'");
  384. for(const std::string &x : request_headers)
  385. list = curl_slist_append(list, x.data());
  386. curl_progress_data limit;
  387. curl_set_common_options(curl_handle, url.data(), &limit);
  388. curl_easy_setopt(curl_handle, CURLOPT_HEADERFUNCTION, writer);
  389. curl_easy_setopt(curl_handle, CURLOPT_HEADERDATA, &response_headers);
  390. curl_easy_setopt(curl_handle, CURLOPT_NOBODY, 1L);
  391. curl_easy_setopt(curl_handle, CURLOPT_HTTPHEADER, list);
  392. if(proxy.size())
  393. curl_easy_setopt(curl_handle, CURLOPT_PROXY, proxy.data());
  394. res = curl_easy_perform(curl_handle);
  395. curl_slist_free_all(list);
  396. if(res == CURLE_OK)
  397. res = curl_easy_getinfo(curl_handle, CURLINFO_HTTP_CODE, &retVal);
  398. curl_easy_cleanup(curl_handle);
  399. return retVal;
  400. }
  401. int webHead(const std::string &url, const std::string &proxy, const string_array &request_headers, std::string &response_headers)
  402. {
  403. return curlHead(url, proxy, request_headers, response_headers);
  404. }