import sys
import os
# Make sibling packages importable: sys.path[0] is the script's directory and
# its parent directory is appended to the module search path.
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
# from get_cookie import Get_cookie
import requests
import logging
# Redirect warnings issued through the ``warnings`` module to the logging system.
logging.captureWarnings(True)
# NOTE(review): ``os`` is already imported above; this second import is redundant.
import os
from lxml import etree
import re

# NOTE(review): the requests in this file go to www.ebay.com, yet the proxy
# bypass is configured for stackoverflow.com -- confirm the intended host.
os.environ['NO_PROXY'] = 'stackoverflow.com'
from datetime import datetime, timedelta
from sqlalchemy import create_engine, delete
import pandas as pd
import json
from multiprocessing import Pool
import time
from all_connect import ConnectSpider
# Project connection helper, created at import time; this file calls its
# ``get_ebay_search_term`` and ``save_ebay_asins`` methods.
Con = ConnectSpider()
import threading
from queue import Queue
import queue
from switch_ip import pppoe_ip
import random
from ast import literal_eval
# ``cookie_list`` is mutated in place by ``Get_asin`` when a cookie is rejected.
from cookie_list import cookie_list
from amazon_params import py_ja3
from search_term import search_terms
class Get_asin:
    """Collect item ids from eBay keyword-search result pages.

    ``run`` loads search terms into a task queue, a worker thread requests up
    to ``MAX_PAGES`` result pages per term through ``get_asin``, and every item
    id found is buffered in ``item_queue`` until ``run`` hands the whole batch
    to ``Con.save_ebay_asins``.
    """

    # Item links look like ``...www.ebay.com/itm/<id>?<query>``; group 1 is the id.
    _ITEM_ID_RE = re.compile(r'www\.ebay\.com/itm/(.*?)\?')
    # A page is only accepted when this marker occurs in it exactly twice.
    _US_MARKER = '"text":"United States - USA"'

    def __init__(self):
        # Browser-like request headers; ``random_ua`` overwrites 'user-agent'
        # before every request.
        self.headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh-CN,zh;q=0.9',
            'sec-ch-ua': '"Google Chrome";v="123", "Not:A-Brand";v="8", "Chromium";v="123"',
            'sec-ch-ua-full-version': '"123.0.6312.59"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-model': '""',
            'sec-ch-ua-platform': '"Windows"',
            'sec-ch-ua-platform-version': '"10.0.0"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'none',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
        }
        # Scraped rows waiting to be saved by ``run``.
        self.item_queue = Queue()
        # Total attempts allowed for one page request (``retry_count`` starts at 1).
        self.MAX_RETRIES = 3
        # Result pages requested per search term.
        self.MAX_PAGES = 3
        # Back-off unit: attempt n sleeps BACKOFF_SECONDS * 2 ** (n - 1) seconds
        # before the next attempt (2s, 4s, ... with the default).
        self.BACKOFF_SECONDS = 2
        # Upper bound on how often one term may be put back on the task queue,
        # so a term that always fails cannot keep the worker loop alive forever.
        self.MAX_REQUEUES = 3
        self._requeue_counts = {}
        # Cookie the worker loop currently uses; replaced when a cookie is rejected.
        self.current_cookie = None

    def get_search_term(self):
        """Fetch the search terms through ``Con`` and dump them to search_term.py."""
        search_term = Con.get_ebay_search_term()
        print(len(search_term))
        # NOTE(review): this file imports ``search_terms`` from search_term.py,
        # but only the bare ``str()`` of the value is written here (no
        # ``search_terms = `` prefix) -- confirm how the file is consumed.
        with open('search_term.py', 'w', encoding='utf-8') as f:
            f.write(str(search_term))

    def random_ua(self):
        """Store a randomly assembled Chrome user-agent string in ``self.headers``."""
        first_num = random.randint(55, 62)
        third_num = random.randint(0, 3200)
        fourth_num = random.randint(0, 140)
        os_type = [
            '(Windows NT 6.1; WOW64)',
            '(Windows NT 10.0; WOW64)',
            '(X11; Linux x86_64)',
            '(Macintosh; Intel Mac OS X 10_12_6)'
        ]
        chrome_version = 'Chrome/{}.0.{}.{}'.format(first_num, third_num, fourth_num)
        ua = ' '.join(['Mozilla/5.0', random.choice(os_type), 'AppleWebKit/537.36',
                       '(KHTML, like Gecko)', chrome_version, 'Safari/537.36'])
        self.headers['user-agent'] = ua

    def _fetch_search_page(self, term, page, cookie_dict):
        """Request one search result page and return ``(status_code, body_text)``."""
        # Parse the cookie first so a malformed cookie fails before any network use.
        cookies = literal_eval(cookie_dict)
        params = {
            '_from': 'R40',
            '_fcid': '1',
            '_nkw': f'{term}',
            '_sacat': '1',
            'rt': 'nc',
            '_pgn': f'{page}',
        }
        self.random_ua()
        # The context manager closes the session and its connections, also on errors.
        with requests.Session() as sess:
            sess.mount('https://www.ebay.com/', py_ja3.DESAdapter())
            response = sess.get('https://www.ebay.com/sch/i.html', params=params,
                                headers=self.headers, cookies=cookies,
                                verify=False, timeout=30)
            return response.status_code, response.text

    def _extract_item_ids(self, hrefs):
        """Return the id of every ``.../itm/<id>?...`` link, skipping other links."""
        item_ids = []
        for href in hrefs:
            found = self._ITEM_ID_RE.search(href)
            if found:
                item_ids.append(found.group(1))
            # A link without the expected shape is ignored instead of aborting
            # the whole page with an IndexError.
        return item_ids

    def _requeue(self, task_queue, term, tag):
        """Put *term* back on *task_queue*; return False once its budget is used up.

        *tag* is appended to the success message to tell the call sites apart.
        """
        key = str(term)
        used = self._requeue_counts.get(key, 0)
        if used >= self.MAX_REQUEUES:
            print(f'{term}已重新入队{used}次，放弃该search')
            return False
        self._requeue_counts[key] = used + 1
        try:
            task_queue.put(term, block=True, timeout=3)
        except queue.Full:
            print('task_queue队列已满，写入失败')
            return False
        print(f'{term}已写入task_queue，{tag}')
        return True

    def _switch_cookie_and_retry(self, term, page, rejected_cookie, task_queue, retry_count):
        """Discard a rejected cookie and repeat the request with another one.

        Returns what the repeated ``get_asin`` call returns, or False when no
        cookie is left or the attempt budget for this page is exhausted.
        """
        try:
            cookie_list.remove(rejected_cookie)
        except ValueError:
            # Already removed by an earlier rejection of the same cookie.
            pass
        print(f'邮编不是us,{term}第{page}页,更换cookie')
        print('cookie剩余：', len(cookie_list))
        if not cookie_list:
            print(f'没有可用的cookie，放弃search: {term}')
            return False
        replacement = random.choice(cookie_list)
        # Remember the replacement so the worker loop stops using the rejected cookie.
        self.current_cookie = replacement
        if retry_count >= self.MAX_RETRIES:
            self._requeue(task_queue, term, '4444444444')
            return False
        return self.get_asin(term, page, replacement, task_queue, retry_count + 1)

    def get_asin(self, i, page, cookie_dict, task_queue, retry_count=1):
        """Scrape one result page of search term *i* into ``item_queue``.

        Args:
            i: Search term; sent as the ``_nkw`` query parameter.
            page: Result page number; sent as ``_pgn``.
            cookie_dict: Cookie as a string containing a Python dict literal
                (it is parsed with ``ast.literal_eval``).
            task_queue: Queue of pending search terms; the term is put back on
                it when the page could not be fetched.
            retry_count: Number of the current attempt, starting at 1.

        Returns:
            True when the page contained item links, so a further page may
            exist; False when it contained none or the request ultimately
            failed (earlier versions returned None on failure, which callers
            treated the same way).
        """
        retries_left = self.MAX_RETRIES - retry_count
        try:
            status_code, body = self._fetch_search_page(i, page, cookie_dict)
            if status_code != 200:
                print(status_code)
                self._requeue(task_queue, i, '222222222222')
                return False

            # Zip-code check from the original code: the page is only trusted
            # when the US marker occurs exactly twice; otherwise the cookie is
            # considered not to be set to a US location.
            if body.count(self._US_MARKER) != 2:
                return self._switch_cookie_and_retry(i, page, cookie_dict, task_queue, retry_count)

            html = etree.HTML(body)
            hrefs = html.xpath('//ul[@class="srp-results srp-list clearfix"]/li//div[@class="s-item__image"]/a/@href')
            if not hrefs:
                print(f'{i} 的最后一页')
                return False

            print(f'{i}的第{page}页')
            for asin in self._extract_item_ids(hrefs):
                item = {'asin': asin, 'created_time': datetime.now(), 'state': 3}
                try:
                    self.item_queue.put(item, block=True, timeout=3)
                    print(f'{asin}已写入item_queue，111111111111')
                except queue.Full:
                    print('item_queue0队列已满，写入失败')
            return True
        except Exception as exc:
            # Worker boundary: any failure (network, parsing, bad cookie) is
            # retried with exponential back-off instead of killing the thread.
            if retries_left > 0:
                wait_time = self.BACKOFF_SECONDS * 2 ** (retry_count - 1)
                print(f"遇到错误，将在{wait_time}秒后重试...")
                time.sleep(wait_time)
                # Pass task_queue explicitly and propagate the result, so a
                # successful retry lets the caller continue with the next page.
                return self.get_asin(i, page, cookie_dict, task_queue, retry_count + 1)
            print(f"重试次数耗尽，将search: {i}放进队列, 错误: {exc}")
            self._requeue(task_queue, i, '3333333333')
            return False

    def workers(self, task_queue, cookie_dict):
        """Thread target: process search terms from *task_queue* until it is empty.

        For every term up to ``MAX_PAGES`` pages are requested; paging stops as
        soon as ``get_asin`` reports that no further page is available.
        """
        self.current_cookie = cookie_dict
        while True:
            try:
                search = task_queue.get_nowait()
            except queue.Empty:
                break
            try:
                for page in range(1, self.MAX_PAGES + 1):
                    # ``current_cookie`` may have been replaced by a previous call.
                    if not self.get_asin(search, page, self.current_cookie, task_queue):
                        break
            except Exception as exc:
                # Keep the thread alive; one bad term must not stop the batch.
                print(f'处理{search}时出错: {exc}')
            finally:
                task_queue.task_done()

    def run(self, search_list, cookie_dict):
        """Scrape every term in *search_list* and save the collected rows.

        Args:
            search_list: Iterable of search terms.
            cookie_dict: Cookie (string form of a dict) the worker starts with.
        """
        task_queue = queue.Queue()
        for search in search_list:
            task_queue.put(search)

        worker_count = 1
        threads = []
        for _ in range(worker_count):
            t = threading.Thread(target=self.workers, args=(task_queue, cookie_dict))
            threads.append(t)
            t.start()

        # Wait for all worker threads to finish.
        for t in threads:
            t.join()

        # Drain everything the workers collected.
        print('准备取出数据..............')
        items_to_save = []
        while True:
            try:
                items_to_save.append(self.item_queue.get_nowait())
            except queue.Empty:
                break

        print('准备保存数据..............')
        if items_to_save:
            try:
                print('开始保存数据..............')
                Con.save_ebay_asins(items_to_save)
            except Exception as e:
                # Best effort: a failed save is reported but does not crash the process.
                print(e)

def workers(start_id, limit):
    """Process-pool task: scrape the search terms in ``search_terms[start_id:limit]``.

    A cookie is picked at random from ``cookie_list`` and a fresh ``Get_asin``
    instance handles the slice.
    """
    batch = search_terms[start_id:limit]
    chosen_cookie = random.choice(cookie_list)
    print(start_id, limit, batch)
    scraper = Get_asin()
    scraper.run(batch, chosen_cookie)

def main():
    """Scrape ``search_terms[0:10000]`` section by section.

    The index range is cut into 100 sections. Each section is split into
    batches of ``batch_size`` terms that ``workers`` processes in a pool of
    ``num_processes`` processes. After every section the IP is switched with
    ``pppoe_ip``: immediately when at least five minutes have passed since
    ``start_time``, otherwise after a random pause of 120-200 seconds.
    """
    # pppoe_ip()
    start_time = datetime.now()
    total = 10000
    parts = 100
    step = total // parts
    intervals = [[i * step, (i + 1) * step] for i in range(parts)]
    print(intervals)
    # To resume an interrupted run, build ``intervals`` starting at the section
    # that contains the first unprocessed index instead of at 0.

    num_processes = 5
    batch_size = 20
    for section in intervals:
        print(f'第{section}批次')
        minid, maxid = section
        # Clamp the last batch so it never reaches into the next section.
        data_range = [
            (start, min(start + batch_size, maxid))
            for start in range(minid, maxid, batch_size)
        ]

        # The context manager terminates the pool if anything below raises.
        with Pool(num_processes) as pool:
            pending = [
                (bounds, pool.apply_async(workers, args=bounds))
                for bounds in data_range
            ]
            pool.close()
            for bounds, result in pending:
                try:
                    # ``get`` re-raises an exception from the worker process;
                    # without it such failures would pass unnoticed.
                    result.get()
                except Exception as exc:
                    print(f'批次{bounds}执行失败: {exc!r}')
            pool.join()

        current_time = datetime.now()
        elapsed_time = current_time - start_time
        if elapsed_time >= timedelta(minutes=5):
            print('时间超过5分钟，切换IP')
            pppoe_ip()
            start_time = current_time  # restart the five-minute window
        else:
            random_wait_time = random.randint(120, 200)
            print(f"等待 {random_wait_time} 秒...")
            time.sleep(random_wait_time)
            # NOTE(review): ``start_time`` is not reset on this path although
            # the IP is switched as well -- confirm that this is intended.
            pppoe_ip()

# Entry-point guard: main() starts a multiprocessing Pool, so it must only run
# when this file is executed as a script, not when the module is imported.
if __name__ == '__main__':
    main()





