
import sys
import os
# Make the parent of the script's directory importable (parent directory);
# presumably so the project-local modules imported below resolve -- TODO confirm.
sys.path.append(os.path.dirname(sys.path[0]))
# from get_cookie import Get_cookie
import requests
import logging
# Route messages from the ``warnings`` module into the logging system.
logging.captureWarnings(True)
from lxml import etree
import re
# Module-level session. Ebay_details.response_data binds its own local ``sess``,
# so this object is not the one used for item requests there.
sess = requests.Session()
# NOTE(review): excludes stackoverflow.com from proxying; the reason is not visible
# in this file -- confirm whether this is still needed.
os.environ['NO_PROXY'] = 'stackoverflow.com'
from datetime import datetime, timedelta
from sqlalchemy import create_engine, delete
import pandas as pd
import json
from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor
import time
import threading
from queue import Queue
import queue
from ast import literal_eval
from switch_ip import pppoe_ip
import random
from cookie_list import cookie_list
from amazon_params import py_ja3

from all_connect import ConnectSpider
# Shared project helper used for loading item ids, saving parsed rows and
# updating row states (presumably a database access layer -- verify in all_connect).
Con = ConnectSpider()

class Ebay_details:
    """Crawl eBay item pages and hand the parsed details to the storage helper.

    One instance processes one batch: ``run`` distributes the items over worker
    threads, every worker fetches pages with ``response_data`` and parses them
    with ``parse_data``. Outcomes travel through the instance queues until
    ``run`` saves the parsed rows and reports the final state of every row.
    """

    # Seconds to wait for eBay before a request counts as a failed attempt.
    REQUEST_TIMEOUT = 30
    # Number of fetcher threads started by run().
    WORKER_THREADS = 3
    # Separator between the item number and the row id inside a task string.
    ID_SEPARATOR = '|-|-|-|-|-|'
    # Payload fragments showing that the page was rendered for a US location
    # (the original code calls this the "zip code" check).
    US_REGION_MARKERS = ('"text":"United States"', '"text":"United States - USA"')
    # Text looked for on the error page of a listing that is no longer valid.
    INVALID_ITEM_TEXT = 'We looked everywhere.'

    def __init__(self):
        """Create the default request headers, the result queues and the retry budget."""
        # Cookies are deliberately absent here: they are picked per request from
        # the cookie list handed to response_data().
        # NOTE(review): the referer points at a local IDE preview URL -- confirm
        # that sending it to eBay is intended.
        self.headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'referer': 'http://localhost:63342/amazon_spider/amazon_spider/amazon_spider/ebay_spider/test.html?_ijt=4s3dufl6otuq298dmf3uid2v4u&_ij_reload=RELOAD_ON_SAVE',
            'sec-ch-ua': '"Google Chrome";v="123", "Not:A-Brand";v="8", "Chromium";v="123"',
            'sec-ch-ua-full-version': '"123.0.6312.59"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-model': '""',
            'sec-ch-ua-platform': '"Windows"',
            'sec-ch-ua-platform-version': '"10.0.0"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'cross-site',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
        }
        self.item_queue = Queue()     # (item dict, row id) for every parsed page
        self.success_queue = Queue()  # row-id lists whose items were saved
        self.failed_queue = Queue()   # row-id lists that ran out of attempts
        self.invalid_queue = Queue()  # row-id lists of listings reported as gone
        self.MAX_RETRIES = 3          # total request attempts per item

    @staticmethod
    def _first(nodes):
        """Return the first XPath result, or None when nothing matched."""
        return nodes[0] if nodes else None

    @staticmethod
    def _drain(source_queue):
        """Remove and return everything currently stored in *source_queue*."""
        drained = []
        while True:
            try:
                drained.append(source_queue.get_nowait())
            except queue.Empty:
                return drained

    def random_ua(self):
        """Store a randomly assembled Chrome user-agent string in ``self.headers``."""
        first_num = random.randint(55, 62)
        third_num = random.randint(0, 3200)
        fourth_num = random.randint(0, 140)
        os_type = [
            '(Windows NT 6.1; WOW64)',
            '(Windows NT 10.0; WOW64)',
            '(X11; Linux x86_64)',
            '(Macintosh; Intel Mac OS X 10_12_6)'
        ]
        chrome_version = 'Chrome/{}.0.{}.{}'.format(first_num, third_num, fourth_num)
        ua = ' '.join(['Mozilla/5.0', random.choice(os_type), 'AppleWebKit/537.36',
                       '(KHTML, like Gecko)', chrome_version, 'Safari/537.36'])
        self.headers['user-agent'] = ua

    def response_data(self, asin, idd, cookie_list_copy, retry_count=1):
        """Fetch the page of one item and route the outcome to the right queue.

        Args:
            asin: eBay item number used to build the item URL.
            idd: Row id that is reported back through the result queues.
            cookie_list_copy: Mutable list of cookie dicts. A cookie whose page is
                not rendered for a US location is removed from this list.
            retry_count: 1-based number of the current attempt.

        An item gets at most ``MAX_RETRIES`` attempts. Attempts that raise are
        retried after an exponential pause. An item that still has no result
        after the last attempt (or whose cookies ran out while retrying) is put
        on ``failed_queue``. When the cookie list is already empty on entry the
        item is only reported on stdout and left untouched.
        """
        if not cookie_list_copy:
            print("Cookie list 为空")
            return

        try:
            needs_retry = self._attempt(asin, idd, cookie_list_copy)
        except Exception as exc:
            # Network errors and pages that cannot be parsed both end up here.
            if retry_count < self.MAX_RETRIES:
                wait_time = 2 ** retry_count
                print(f"{asin}{exc}遇到错误，将在{wait_time}秒后重试...")
                time.sleep(wait_time)
                self.response_data(asin, idd, cookie_list_copy, retry_count + 1)
            else:
                print(f"重试次数耗尽，将asin: {asin}---{idd}放进队列, 错误: {exc}")
                self.failed_queue.put([idd])
            return

        if not needs_retry:
            return
        if retry_count < self.MAX_RETRIES and cookie_list_copy:
            # Always pass the whole list on: the next attempt picks its own cookie.
            self.response_data(asin, idd, cookie_list_copy, retry_count + 1)
        else:
            print(f"重试次数耗尽，将asin: {asin}---{idd}放进队列,")
            self.failed_queue.put([idd])

    def _attempt(self, asin, idd, cookie_list_copy):
        """Send one request for *asin* and act on the answer.

        Returns True when the item still has no result and should be retried,
        False when it was parsed or queued as invalid. Exceptions propagate to
        response_data(), which owns the retry policy.
        """
        self.random_ua()
        cookie_dict = random.choice(cookie_list_copy)
        with requests.Session() as sess:
            # The adapter only affects requests sent through this session
            # (presumably it adjusts the TLS fingerprint -- see amazon_params.py_ja3).
            sess.mount('https://www.ebay.com/', py_ja3.DESAdapter())
            response = sess.get(
                f'https://www.ebay.com/itm/{asin}',
                cookies=cookie_dict,
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            if response.status_code != 200:
                print(response.status_code, asin, '|-|-|-|-|-|', idd)
                return not self._queue_if_invalid(response, asin, idd)

            html = etree.HTML(response.text)
            # The canonical link tells whether eBay answered with another item.
            href = self._first(html.xpath('//head/link[@rel="canonical"]/@href'))
            if not href:
                print('href为空，可能是cookie原因')
                return True

            if href.split('/')[-1] != asin:
                # Redirected: parse the page the canonical link points to.
                redirected = sess.get(
                    href,
                    cookies=cookie_dict,
                    headers=self.headers,
                    timeout=self.REQUEST_TIMEOUT,
                )
                self.parse_data(redirected, asin, idd)
                return False

            if any(marker in response.text for marker in self.US_REGION_MARKERS):
                self.parse_data(response, asin, idd)
                return False

        # The page was not rendered for a US location: retire this cookie.
        if cookie_dict in cookie_list_copy:
            cookie_list_copy.remove(cookie_dict)
        print(f'{asin}邮编不是us,更换cookie')
        return True

    def _queue_if_invalid(self, response, asin, idd):
        """Queue *idd* as invalid if the error page carries the invalid-item text.

        Returns True when the item was queued, False otherwise.
        """
        html = etree.HTML(response.text)
        if html is None:
            return False
        messages = html.xpath('//div[@class="error-header-v2__text-content"]/p/text()')
        if not any(self.INVALID_ITEM_TEXT in message for message in messages):
            return False
        print(f'asin{asin} 已失效')
        self.invalid_queue.put([idd])
        print(f'将{asin}放入失效队列')
        return True

    def parse_data(self, response, asin, idd):
        """Extract the item fields from *response* and queue them for saving.

        Args:
            response: HTTP response whose ``text`` holds the item page.
            asin: eBay item number the page belongs to.
            idd: Row id stored next to the parsed item on ``item_queue``.

        Raises:
            ValueError: If the page contains no title. response_data() treats
                this like any other failed attempt.
        """
        html = etree.HTML(response.text)

        img_url = self._first(html.xpath(
            '//link[@as="image"]/@href | //div[@class="ux-image-carousel-item image-treatment active  image"]/img/@src'))
        if img_url:
            zoom_images = html.xpath(
                '//div[@class="x-photos-min-view filmstrip filmstrip-x"]//div[@class="ux-image-carousel zoom img-transition-medium"]//img/@data-zoom-src')
            # A main image without a filmstrip still counts as one picture.
            img_num = len(zoom_images) if zoom_images else 1
        else:
            img_num = 0

        title = self._first(html.xpath(
            '//div[@class="ux-bin-nudge__title"]/span/text() | //meta[@name="twitter:title"]/@content | //h1[@class="product-title"]/span/text() | //span[@class="ux-textspans ux-textspans--BOLD"]/text()'))
        if title is None:
            raise ValueError(f'no title found on the page of item {asin}')
        title_num = len(title)

        price = self._first(html.xpath(
            '//div[@class="x-bin-price__content"]/div/span[@class="ux-textspans"]/text()| //div[@class="x-price-primary"]/span/text()'))
        if price:
            # Keep digits and dots only; a text without any becomes None, like weight.
            price = ''.join(char for char in price if char.isdigit() or char == '.') or None

        category = '-'.join(html.xpath('//li/a[@class="seo-breadcrumb-text"]/span/text()'))

        # The count is the parenthesised part of the second title span, e.g. "(1,234)".
        total_comments = None
        feedback_spans = html.xpath('//h2[@class="fdbk-detail-list__title"]/span/text()')
        if len(feedback_spans) > 1:
            _, opening, tail = feedback_spans[1].partition('(')
            if opening:
                total_comments = tail.split(')')[0].replace(',', '')

        rating = self._first(html.xpath(
            '//div[@class="d-stores-info-categories__container__info__section__item"]/span[1]/text()'))
        volume = self._first(html.xpath(
            '//dt//span[contains(text(), "Item dimensions L x W x H")]/../../../../dd//text()'))
        weight = self._first(html.xpath(
            '//dt//span[contains(text(), "Item Weight")]/../../../../dd//text() | //li/div[@class="s-name" and normalize-space(.)="Item Weight"]/../div[@class="s-value"]/text()'))

        if weight == 'NA':
            weight = None
        if weight:
            # NOTE(review): stripping first merges multi-part values ("2 lb 3 oz" -> "23");
            # kept unchanged because stored data depends on it.
            digits = ''.join(char for char in weight if char.isdigit() or char == '.')
            match = re.search(r'[-+]?\d+(\.\d+)?', digits)
            weight = match.group() if match else None

        # One timestamp for both fields so they can never disagree.
        now = datetime.now()
        item = {
            'asin': asin,
            'img_url': img_url,
            'img_num': img_num,
            'title': title,
            'title_len': title_num,
            'price': price,
            'category': category,
            'total_comments': total_comments,
            'rating': rating,
            'volume': volume,
            'weight': weight,
            'date_info': f"{now.year}-{now.month:02d}",
            'created_time': now.strftime('%Y-%m-%d %H:%M:%S'),
        }
        print(f'解析成功：{asin}---{idd}')
        self.item_queue.put((item, idd))

    def update(self, success_id, failed_id, invalid_id):
        """Report the final row states: success = 21, failed = 3, invalid = 4.

        Every non-empty group is sent on its own, so an error while updating
        one state does not prevent the other states from being written.

        NOTE(review): the groups are lists of row-id lists of differing lengths
        (see run()); confirm that the ``Con`` updaters expect that shape.
        """
        print(success_id, failed_id, invalid_id, 66666666666)

        if success_id:
            try:
                Con.ebay_upstate_to_21(success_id)
                print("所有数据处理完成，状态已更新为21。")
            except Exception as e:
                print(f"更新状态时发生错误: {e}")

        if failed_id:
            print(failed_id)
            try:
                Con.ebay_upstate_to_3(failed_id)
                print("所有数据处理完成，状态已更新为3。")
            except Exception as e:
                print(f"更新状态时发生错误: {e}")

        if invalid_id:
            print(invalid_id)
            try:
                Con.ebay_upstate_to_4(invalid_id)
                print("所有数据处理完成，状态已更新为4。")
            except Exception as e:
                print(f"更新状态时发生错误: {e}")

    def workers(self, task_queue):
        """Thread body: process ``(asin, idd)`` tasks until *task_queue* is empty.

        Each thread works on its own copy of the cookie list, so cookies it
        retires do not disappear for the other threads.
        """
        cookie_list_copy = cookie_list.copy()

        while True:
            # get_nowait() instead of empty()+get(): another thread may take the
            # last task between the two calls.
            try:
                asin, idd = task_queue.get_nowait()
            except queue.Empty:
                break
            try:
                self.response_data(asin, idd, cookie_list_copy)
            finally:
                task_queue.task_done()

    def run(self, asin_id_pairs):
        """Crawl a batch of items, save the parsed rows and report the row states.

        Args:
            asin_id_pairs: Strings of the form ``<asin>|-|-|-|-|-|<row id>``.
                An empty batch is a no-op.
        """
        if not asin_id_pairs:
            return

        task_queue = queue.Queue()
        for asin_id in asin_id_pairs:
            parts = asin_id.split(self.ID_SEPARATOR)
            task_queue.put((parts[0], parts[1]))

        threads = [
            threading.Thread(target=self.workers, args=(task_queue,))
            for _ in range(self.WORKER_THREADS)
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join()

        # All producers have finished, so the queues can be drained safely.
        parsed = self._drain(self.item_queue)
        all_items = [item for item, _ in parsed]
        all_idds = [idd for _, idd in parsed]

        if all_items:
            try:
                Con.save_ebay_details_bulk(all_items)
            except Exception as e:
                print(f"保存时发生错误: {e}")
                self.failed_queue.put(all_idds)
            else:
                self.success_queue.put(all_idds)

        self.update(
            self._drain(self.success_queue),
            self._drain(self.failed_queue),
            self._drain(self.invalid_queue),
        )


def worker(start_id, limit):
    """Load the pending items of one id range and crawl them in this process.

    Args:
        start_id: First row id of the range handed to ``Con.get_ebay_asins``.
        limit: Number of rows in the range.
    """
    pairs = Con.get_ebay_asins(start_id, limit)
    range_end = start_id + limit
    print(f'{start_id}"-[][][]-"{range_end},{pairs}')
    if not pairs:
        # Nothing to crawl for this range.
        return
    Ebay_details().run(pairs)


def split_id_ranges(min_id, max_id, batch_size):
    """Split the inclusive id span [min_id, max_id] into (start_id, row_count) batches."""
    return [
        (start, min(batch_size, max_id - start + 1))
        for start in range(min_id, max_id + 1, batch_size)
    ]


def main():
    """Crawl all id spans returned by ``Con.ebay_asin_split_wj``, one span at a time.

    Every span is cut into batches of 100 ids that are processed by a pool of
    three worker processes. ``pppoe_ip()`` is called once at start-up and again
    whenever more than ten minutes have passed at the end of a span.
    """
    pppoe_ip()
    start_time = datetime.now()
    num_processes = 3
    batch_size = 100

    for i in range(800):
        minid_maxid = Con.ebay_asin_split_wj(i)
        if not minid_maxid:
            print('已全部获取')
            break

        bounds = minid_maxid.split('-')
        minid, maxid = bounds[0], bounds[1]
        print(f'第{i + 1}批次', minid, maxid)
        data_range = split_id_ranges(int(minid), int(maxid), batch_size)

        # The context manager terminates the pool even if submitting or joining fails.
        with Pool(num_processes) as p:
            pending = [
                (start_id, p.apply_async(worker, args=(start_id, limit)))
                for start_id, limit in data_range
            ]
            p.close()
            p.join()

        # apply_async() stores worker exceptions in the result object; reading
        # the results is the only way to notice them.
        for start_id, result in pending:
            try:
                result.get()
            except Exception as exc:
                print(f'worker for ids starting at {start_id} failed: {exc}')

        current_time = datetime.now()
        if current_time - start_time >= timedelta(minutes=10):
            print('时间超过10分钟，切换IP')
            pppoe_ip()
            start_time = current_time  # restart the ten-minute window

# Script entry point. The guard also matters for multiprocessing: where worker
# processes import this module on start-up, it keeps them from re-running main().
if __name__ == '__main__':
    main()




