import sys
import os

sys.path.append(os.path.dirname(sys.path[0]))  # 上级目录

# from get_cookie import Get_cookie
import requests
import logging
logging.captureWarnings(True)
import os
from lxml import etree
import re
sess = requests.Session()
os.environ['NO_PROXY'] = 'stackoverflow.com'
from datetime import datetime, timedelta
from sqlalchemy import create_engine, delete
import pandas as pd
import json
from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor
import time
from requests.exceptions import ConnectionError
import threading
from queue import Queue
import queue
from all_connect import ConnectSpider
from switch_ip import pppoe_ip
import random
from cookie_list import cookie_lists
from  amazon_params import py_ja3
from concurrent.futures import ThreadPoolExecutor


# Shared connector instance created at import time; the code below calls its
# query, save and status-update helpers (defined in all_connect.ConnectSpider).
Con = ConnectSpider()
class Amazon_details:
    """Download Amazon product pages and turn them into flat detail records.

    One instance handles one batch: ``run`` fans the ASINs out to worker
    threads, every parsed record is put on ``item_queue`` and the row ids of
    finished work are collected in ``success_queue`` / ``failed_queue`` so
    that ``update`` can write the final status through ``Con``.
    """

    # Names the buy box shows when Amazon itself ships or sells the item.
    _AMAZON_NAMES = ('Amazon', 'Amazon.com')
    # "<number> <unit>" at the start of a weight string, e.g. "2.4 ounces".
    # Anchoring the unit right after the number keeps "Milligrams" from being
    # mistaken for "Grams".
    _WEIGHT_RE = re.compile(
        r'^\s*([\d,]*\.?\d+)\s*(ounces?|grams?|kilograms?|pounds?)\b',
        re.IGNORECASE)
    # Value cell of a row in the product-details table; filled with the row
    # label and either "/" (direct text) or "//" (all descendant text).
    _DETAIL_CELL = ('//th[@class="a-color-secondary a-size-base prodDetSectionEntry" '
                    'and normalize-space(.)="{}"]/../td{}text()')

    def __init__(self):
        self.headers = {
            "Host": "www.amazon.com",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
        }
        self.params = {"th": "1"}
        self.item_queue = Queue()      # (item dict, row id) pairs ready to save
        self.success_queue = Queue()   # row ids that were saved
        self.failed_queue = Queue()    # row ids that could not be processed
        self.MAX_RETRIES = 3

    # ------------------------------------------------------------------
    # small pure helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _cookie_header(cookie_str):
        """Turn a JSON object string of cookies into a ``k=v;`` header value."""
        return ''.join(f"{k}={v};" for k, v in json.loads(cookie_str).items())

    @staticmethod
    def _rotate_cookie(bad_cookie):
        """Drop ``bad_cookie`` from the shared pool and return the next one.

        Returns ``None`` when the pool is exhausted. Removing the specific
        cookie (instead of whatever is at the head) keeps several threads
        that failed with the same cookie from discarding good ones.
        """
        try:
            cookie_lists.remove(bad_cookie)
        except ValueError:
            pass  # another thread already removed it
        try:
            return cookie_lists[0]
        except IndexError:
            return None

    @staticmethod
    def _first(html, expr, index=0):
        """Return the ``index``-th result of an XPath query, or ``None``."""
        found = html.xpath(expr)
        return found[index] if len(found) > index else None

    @classmethod
    def _first_text(cls, html, expr, index=0):
        """Like ``_first`` for text nodes, with surrounding whitespace removed."""
        value = cls._first(html, expr, index)
        return value.strip() if value is not None else None

    @classmethod
    def _classify_buy_box(cls, ships_from, sold):
        """Map shipper/seller names to the buy-box seller type code.

        1: Amazon ships and sells, 2: Amazon ships for another seller,
        3: neither is Amazon, 4: any other combination.
        """
        amazon_ships = ships_from in cls._AMAZON_NAMES
        amazon_sells = sold in cls._AMAZON_NAMES
        if amazon_ships and amazon_sells:
            return 1
        if amazon_ships:
            return 2
        if not amazon_sells:
            return 3
        return 4

    @classmethod
    def _weight_to_pounds(cls, weight):
        """Convert a weight string such as ``"2.4 ounces"`` to pounds.

        Ounces, grams, kilograms and pounds are recognised (any letter case);
        the result is a float. Empty values and strings in another format
        are returned unchanged.
        """
        if not weight or not isinstance(weight, str):
            return weight
        match = cls._WEIGHT_RE.match(weight.replace('\u200e', ''))
        if match is None:
            return weight
        value = float(match.group(1).replace(',', ''))
        unit = match.group(2).lower()
        if unit.startswith('ounce'):
            return value / 16
        if unit.startswith('kilogram'):
            return value * 2.20462
        if unit.startswith('gram'):
            return value / 453.592
        return value  # already pounds

    @staticmethod
    def _rank_from(texts):
        """Extract the number following the first ``#`` in the joined texts."""
        joined = ' '.join(texts)
        if '#' not in joined:
            return None
        return joined.split('#')[1].split(' ')[0]

    def _buy_box_seller_type(self, html):
        """Work out the buy-box seller type code (1-4) for a product page."""
        fbt_text = self._first(
            html,
            '//div[@class="a-section a-spacing-none _p13n-desktop-sims-fbt_fbt-desktop_shipping-info-show-box__17yWM"]/div/span[@class="a-size-base a-color-secondary"]//text()')
        abbreviated = '//div[@id="shipFromSoldByAbbreviated_feature_div"]//span[@class="a-size-small"]/text()'

        if fbt_text:
            if fbt_text == 'Ships from and sold by Amazon.com.':
                return 1
            sold = self._first_text(html, '//div[@id="merchantInfoFeature_feature_div"]//a/text()')
            if not sold:
                sold = self._first_text(html, abbreviated, 3)
            ships_from = self._first_text(
                html,
                '//div[@id="fulfillerInfoFeature_feature_div"]//span[@class="a-size-small offer-display-feature-text-message"]/text()')
            if not ships_from:
                ships_from = self._first_text(html, abbreviated, 1)
            return self._classify_buy_box(ships_from, sold)

        # A "Sold by <link>" block is treated as a third-party offer.
        if html.xpath('//div[contains(text(),"Sold by")]/a/text()'):
            return 3

        # Last resort: a four-cell block whose 2nd and 4th non-empty texts are
        # used as shipper and seller.
        cells = [text.strip() for text in
                 html.xpath('//div[@class="a-column a-span12 a-text-left truncate"]//text()')
                 if text.strip()]
        if len(cells) >= 4:
            return self._classify_buy_box(cells[1], cells[3])
        return 4

    # ------------------------------------------------------------------
    # fetching
    # ------------------------------------------------------------------
    def response_data(self, asin, idd, cookie_str, retry_count=1):
        """Fetch the detail page of ``asin`` and hand it to ``parse_data``.

        Args:
            asin: product identifier used to build the page URL.
            idd: row id reported to ``failed_queue`` when the page cannot be
                processed.
            cookie_str: JSON object string holding the cookies to send.
            retry_count: 1-based attempt number for connection errors.

        Captcha pages are reported as failures. When the page is not
        localised to "New York 10010" the cookie is dropped from the shared
        pool and the request is repeated with the next one. Connection errors
        are retried with exponential back-off up to ``MAX_RETRIES`` attempts.
        """
        url = f'https://www.amazon.com/dp/{asin}'
        retries_left = self.MAX_RETRIES - retry_count

        try:
            # Work on a copy so that threads using different cookies do not
            # overwrite each other's header, and keep ``cookie_str`` intact
            # (still JSON) for the retry paths below.
            headers = dict(self.headers)
            headers['cookie'] = self._cookie_header(cookie_str)

            sess.mount('https://www.amazon.com/', py_ja3.DESAdapter())
            response = sess.get(url, headers=headers, timeout=30)
            html = etree.HTML(response.text)

            # Captcha check
            if html.xpath('//h4[contains(text(),"Type the characters you see in this image:")]'):
                print(f'验证码页面 {asin}')
                self.failed_queue.put(idd)
                return

            # The page must be localised to New York 10010
            new_york = html.xpath('//span[@id="glow-ingress-line2" and contains(text(),"New York 10010")]')
            if new_york:
                self.parse_data(response, asin, idd)
                return

            print(f'邮编不是10010：{new_york} {asin}')
            new_cookie = self._rotate_cookie(cookie_str)
            print('cookie剩余：', len(cookie_lists))
            if new_cookie:
                self.response_data(asin, idd, new_cookie)
            else:
                # No usable cookie left: report the row instead of losing it.
                self.failed_queue.put(idd)

        except ConnectionError as exc:
            if retries_left > 0:
                wait_time = 2 ** retry_count
                print(f"遇到错误，将在{wait_time}秒后重试...")
                time.sleep(wait_time)
                self.response_data(asin, idd, cookie_str, retry_count + 1)
            else:
                print(f"重试次数耗尽，放弃处理ASIN: {asin}, 错误: {exc}")
                self.failed_queue.put(idd)
        except Exception as e:
            print(f"处理{asin}时发生错误: {e}")
            self.failed_queue.put(idd)

    # ------------------------------------------------------------------
    # parsing
    # ------------------------------------------------------------------
    def parse_data(self, response, asin, idd):
        """Extract the product fields from ``response`` and queue the record.

        On success ``(item, idd)`` is put on ``item_queue``; any exception
        while parsing puts ``idd`` on ``failed_queue`` instead. ``weight`` is
        stored in pounds as a float when its unit is recognised, and
        ``img_url`` is always a single URL (or ``None``).
        """
        try:
            page = response.text
            html = etree.HTML(page)

            main_image = re.search(r"'dp60MainImage': '(.*?)',", page)
            img_url = main_image.group(1) if main_image else None
            img_num = len(html.xpath('//ul/li[@class="a-spacing-small item"]'))
            if not img_url:
                # Fall back to the image gallery JSON embedded in the page.
                compressed_html_code = re.sub(r'>\s+<', '><', page)
                compressed_html_code = re.sub(r'\s{2,}', ' ', compressed_html_code)
                gallery = re.findall(r',"main":\{"(.*?jpg)"', compressed_html_code)
                img_url = gallery[0] if gallery else None
                img_num = len(gallery)

            # A page without a title raises here and is reported as failed.
            title = html.xpath('//div[@id="imgTagWrapperId"]/img/@alt')[0]
            title_len = len(title)

            price = self._first(
                html, '//div[@id="apex_offerDisplay_desktop" ]//span[@class="a-offscreen"]/text()')
            if price and '$' in price:
                price = price.replace('$', '')

            category = html.xpath('//ul[@class="a-unordered-list a-horizontal a-size-small"]//li//text()')
            category = ''.join([part.strip() for part in category if part.strip()])

            rating = self._first(html, '//a/span[@class="a-size-base a-color-base"]/text()')

            total_comments = self._first(html, '//a/span[@id="acrCustomerReviewText"]/text()')
            if total_comments is not None:
                total_comments = total_comments.split(' ratings')[0].replace(',', '')
            if total_comments == '1 rating':
                total_comments = 1

            buy_box_seller_type = self._buy_box_seller_type(html)

            # Bucket the number of selectable quantities: 1 = more than 20,
            # 2 = 1..20, 3 = no quantity selector.
            option_count = len(html.xpath('//div[@id="selectQuantity"]//select[@id="quantity" ]/option/text()'))
            if option_count > 20:
                page_inventory = 1
            elif option_count >= 1:
                page_inventory = 2
            else:
                page_inventory = 3

            volume = self._first_text(html, self._DETAIL_CELL.format('Product Dimensions', '/'))
            if volume is not None:
                volume = volume.replace('\u200e', '')
            weight = self._first_text(html, self._DETAIL_CELL.format('Item Weight', '/'))
            if weight is not None:
                weight = weight.replace('\u200e', '')
            if not volume and not weight:
                # Bullet-list layout: "<dimensions>; <weight>" in one span.
                volume_weight = self._first(
                    html,
                    '//span[contains(text(), "Package Dimensions")]/following-sibling::span/text() | //span[contains(text(), "Product Dimensions")]/following-sibling::span/text()')
                if volume_weight is not None:
                    pieces = volume_weight.split(';')
                    volume = pieces[0].replace('\u200e', '')
                    weight = pieces[1].replace('\u200e', '') if len(pieces) > 1 else None
                else:
                    volume = None
                    weight = None

            weight = self._weight_to_pounds(weight)

            rank = self._rank_from(html.xpath(self._DETAIL_CELL.format('Best Sellers Rank', '//')))
            if not rank:
                rank = self._rank_from(html.xpath('//span[contains(text(), "Best Sellers Rank")]/../text()'))
            if rank:
                rank = int(rank.replace(',', ''))

            launch_time = self._first(html, self._DETAIL_CELL.format('Date First Available', '//'))
            if not launch_time:
                launch_time = self._first(
                    html, '//span[contains(text(), "Date First Available")]/following-sibling::span/text()')
            if launch_time:
                launch_time = datetime.strptime(launch_time.strip(), "%B %d, %Y").strftime("%Y-%m-%d")

            # Media kinds present on the page: 1 = images, 2 = videos,
            # 3 = images inside the A+ content section.
            media_kinds = []
            if img_num != 0:
                media_kinds.append(1)
            if html.xpath(
                    '//span[@class="a-size-mini a-color-secondary video-count a-text-bold a-nowrap"and contains(text(), "VIDEOS")]'):
                media_kinds.append(2)
            if html.xpath('//div[@class="aplus-v2 desktop celwidget"]//img'):
                media_kinds.append(3)
            img_type = ', '.join(map(str, media_kinds))

            brand = self._first(html, '//a[@id="bylineInfo"]/text()')

            breadcrumb_links = html.xpath('//div[@id="wayfinding-breadcrumbs_feature_div"]//a/@href')
            node_id = breadcrumb_links[-1].split('node=')[-1] if breadcrumb_links else None

            bought_texts = html.xpath('//span[@id="social-proofing-faceout-title-tk_bought"]//text()')
            buy_sales = ' '.join(bought_texts).strip() if bought_texts else None

            now = datetime.now()
            item = {
                'asin': asin,
                'img_url': img_url,
                'img_num': img_num,
                'title': title,
                'title_len': title_len,
                'price': price,
                'category': category,
                'rating': rating,
                'total_comments': total_comments,
                'buy_box_seller_type': buy_box_seller_type,
                'page_inventory': page_inventory,
                'volume': volume,
                'weight': weight,
                'rank': rank,
                'launch_time': launch_time,
                'img_type': img_type,
                'brand': brand,
                'node_id': node_id,
                'buy_sales': buy_sales,
                'date_info': f"{now.year}-{now.month:02d}",
                'created_time': now,
            }
            # Queue the record together with its row id for the save phase.
            self.item_queue.put((item, idd))
            print(f'解析成功：{asin}')

        except Exception as e:
            self.failed_queue.put(idd)
            print(f'解析出错: {asin}', e)

    # ------------------------------------------------------------------
    # orchestration
    # ------------------------------------------------------------------
    def update(self, success_id, failed_id):
        """Write the final status of the processed rows through ``Con``.

        ``success_id`` rows go to ``Con.upstate_to_21`` and ``failed_id`` rows
        to ``Con.upstate_to_3``; empty lists are skipped and an error in one
        update does not prevent the other.
        """
        try:
            if success_id:
                Con.upstate_to_21(success_id)
                print("所有数据处理完成，状态已更新为21。")
        except Exception as e:
            print(f"更新状态时发生错误: {e}")

        try:
            if failed_id:
                Con.upstate_to_3(failed_id)
                print("所有数据处理完成，状态已更新为3。")
        except Exception as e:
            print(f"更新状态时发生错误: {e}")

    def workers(self, task_queue):
        """Thread body: process ``(asin, idd, cookie_str)`` tasks until none remain."""
        while True:
            # A non-blocking get is the only race-free "is there more work"
            # test when several threads drain the same queue.
            try:
                asin, idd, cookie_str = task_queue.get_nowait()
            except queue.Empty:
                return
            try:
                self.response_data(asin, idd, cookie_str)
            except Exception as e:
                print(f"处理{asin}时出错: {e}")
                self.failed_queue.put(idd)
            finally:
                task_queue.task_done()

    def run(self, asin_id_pairs, cookie_str, num_threads=20):
        """Process one batch: fetch and parse in threads, save, update status.

        Args:
            asin_id_pairs: strings of the form ``"<asin>|-|-|-|-|-|<row id>"``.
            cookie_str: JSON object string with the cookies for the requests.
            num_threads: upper bound on the number of worker threads.
        """
        task_queue = queue.Queue()
        for asin_id in asin_id_pairs:
            parts = asin_id.split('|-|-|-|-|-|')
            task_queue.put((parts[0], parts[1], cookie_str))

        threads = [threading.Thread(target=self.workers, args=(task_queue,))
                   for _ in range(min(num_threads, task_queue.qsize()))]
        for t in threads:
            t.start()
        # Wait for every worker thread to finish
        for t in threads:
            t.join()

        # Save everything that was parsed
        while not self.item_queue.empty():
            item, idd = self.item_queue.get()
            try:
                Con.save_asin_detail(item)
                self.success_queue.put(idd)
            except Exception as e:
                print(f"保存数据时发生错误: {e}")
                self.failed_queue.put(idd)
            finally:
                self.item_queue.task_done()

        all_success_ids = []
        all_failed_ids = []
        while not self.success_queue.empty():
            all_success_ids.append(self.success_queue.get())
        while not self.failed_queue.empty():
            all_failed_ids.append(self.failed_queue.get())

        self.update(all_success_ids, all_failed_ids)

def worker(start_id, limit):
    """Process one id range inside a pool process.

    Loads the ``asin|-|-|-|-|-|id`` strings for the range through
    ``Con.us_all_syn_st_month_2024_05(start_id, limit)`` and hands them to a
    fresh ``Amazon_details`` together with the first cookie of the shared
    pool. Nothing is done when the range is empty or no cookie is left.
    """
    asin_id_pairs = Con.us_all_syn_st_month_2024_05(start_id, limit)
    # Take the head of the pool without copying it; an exhausted pool yields
    # None instead of raising IndexError.
    cookie_str = next(iter(cookie_lists), None)
    if asin_id_pairs and cookie_str:
        Amazon_details().run(asin_id_pairs, cookie_str)

def main(total_batches=160, num_processes=10, batch_size=200):
    """Run the whole crawl batch by batch.

    For every batch index the id range is read from
    ``Con.us_asin_split_test_wj`` (a ``"<min id>-<max id>"`` string), cut into
    chunks of ``batch_size`` ids and processed by ``worker`` in a pool of
    ``num_processes`` processes. The IP is switched through ``pppoe_ip`` at
    start-up and again after a batch whenever at least five minutes have
    passed since the last switch.

    Args:
        total_batches: number of batch indexes to process.
        num_processes: size of the process pool used for each batch.
        batch_size: number of ids handed to a single ``worker`` call.
    """
    pppoe_ip()
    project_start = datetime.now()
    # Kept apart from project_start so that switching the IP does not reset
    # the total run time reported at the end.
    last_ip_switch = project_start

    def report_failure(exc):
        # apply_async drops worker exceptions unless a callback collects them.
        print(f"批次任务执行出错: {exc}")

    for i in range(total_batches):
        print(f'第{i + 1}批次')
        id_bounds = Con.us_asin_split_test_wj(i).split('-')
        minid, maxid = id_bounds[0], id_bounds[1]
        print(minid, maxid)

        first_id, last_id = int(minid), int(maxid)
        # (start id, row count) per chunk; the last chunk is clipped so that
        # it never runs past the maximum id.
        data_range = [(start, min(batch_size, last_id - start + 1))
                      for start in range(first_id, last_id + 1, batch_size)]

        # Process the chunks in parallel; the context manager releases the
        # pool even if scheduling fails.
        with Pool(num_processes) as pool:
            for start_id, limit in data_range:
                pool.apply_async(worker, args=(start_id, limit),
                                 error_callback=report_failure)
            pool.close()
            pool.join()

        current_time = datetime.now()
        if current_time - last_ip_switch >= timedelta(minutes=5):
            print('时间超过5分钟，切换IP')
            pppoe_ip()
            last_ip_switch = current_time

    # total_seconds() also counts whole days, unlike timedelta.seconds.
    total_seconds = int((datetime.now() - project_start).total_seconds())
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    print(f'项目总共耗时: {hours}小时 {minutes}分钟 {seconds}秒')

# if __name__ == '__main__':
#     main()


# Scratch check of the ounces -> pounds conversion. It executes whenever this
# module is run or imported and prints the converted value.
weight = '2.4 ounces'
if 'ounces' in (weight or ''):
    weight = float(weight.partition('ounces')[0].strip().replace('\u200e', '')) / 16
print(weight)