import os
import json
import html as html_module
from lxml import etree

root_dir = r'C:\Users\ASUS\Desktop\新建文件夹\新建文件夹 (2)'

# Carousel heading that every extraction pass deliberately skips.
_SKIPPED_HEADING = 'Videos'

# Carousels rendered inside the US-marketplace container div.
_MARKETPLACE_DIV = "//div[@data-marketplaceid='ATVPDKIKX0DER']"

# Headings of carousels laid out in an "a-column a-span8" column.
_SPAN8_HEADING = ("//div[@class='a-column a-span8']"
                  "/h2[contains(@class,'carousel-heading')]/text()")


def _span8_carousels(tree):
    """Map each "a-span8" carousel heading to the ASINs in its options JSON.

    The ASINs come from the ``ajax.id_list`` entry of the carousel's
    ``data-a-carousel-options`` attribute, where every item is a
    ``|``-separated string whose first field is kept.
    NOTE(review): the original comment pointed at "Customers also search"
    in us_B0D4QGW5RX.html as the sample for this layout -- confirm.
    """
    found = {}
    for heading in tree.xpath(_SPAN8_HEADING):
        print(heading)
        if heading == _SKIPPED_HEADING:
            continue
        # The heading is passed as an XPath variable rather than spliced into
        # the expression, so quotes in the page text cannot break the query.
        raw_options = tree.xpath(
            "//div[@class='a-column a-span8']/h2[contains(text(),$heading)]"
            "/parent::div/parent::div/parent::div/parent::div"
            "/@data-a-carousel-options",
            heading=str(heading))
        if not raw_options:
            continue
        options = json.loads(html_module.unescape(raw_options[0]))
        ajax = options.get('ajax')
        if not ajax:
            continue
        asins = [entry.split('|')[0] for entry in ajax.get('id_list', [])]
        if asins:
            found[heading] = asins
    return found


def _marketplace_carousels(tree):
    """Map marketplace carousel titles to the ids listed in their options.

    Each ``ajax.id_list`` item is itself a JSON document whose ``id`` field
    is collected.
    NOTE(review): titles and option blobs are paired purely by position, which
    assumes one <h2> text node per carousel in document order -- confirm.
    """
    option_blobs = tree.xpath(_MARKETPLACE_DIV + "/@data-a-carousel-options")
    titles = tree.xpath(_MARKETPLACE_DIV + "//h2/text()")
    found = {}
    # zip() stops at the shorter list, matching the min(len, len) pairing.
    for raw_options, title in zip(option_blobs, titles):
        # Decode entities -> parse the JSON -> pull out the id list.
        outer = json.loads(html_module.unescape(raw_options))
        id_list = outer.get('ajax', {}).get('id_list', [])
        found[title.strip()] = [json.loads(item)['id'] for item in id_list]
    return found


def _data_asin_sections(tree):
    """Map section headings to the ``data-asin`` values found near them.

    "Near" means anywhere under the heading's grandparent or
    great-grandparent element.
    """
    headings = tree.xpath(
        '//h2[contains(@class,"a-spacing-medium")]/text()|' + _SPAN8_HEADING)
    found = {}
    for heading in headings:
        if heading == _SKIPPED_HEADING:
            continue
        asins = tree.xpath(
            '//h2[contains(text(),$heading)]/parent::div/parent::div'
            '//@data-asin'
            '|//h2[contains(text(),$heading)]/parent::div/parent::div'
            '/parent::div//@data-asin',
            heading=str(heading))
        print('h2_str_list::', heading, asins)
        if asins:
            found[heading] = asins
    return found


def _extract_ad_asins(tree):
    """Return the heading -> ASIN-list dicts extracted from one parsed page.

    The result holds at most two dicts: the "a-span8" carousels first, then
    the marketplace carousels merged with the ``data-asin`` sections. Each
    dict is appended exactly once, and only when it is non-empty.
    """
    if tree is None:
        # etree.HTML may hand back None when the input contains no markup;
        # treat such a file as having nothing to extract.
        return []
    groups = []
    span8 = _span8_carousels(tree)
    if span8:
        groups.append(span8)
    merged = _marketplace_carousels(tree)
    # Later sections overwrite same-named carousel entries, as before.
    merged.update(_data_asin_sections(tree))
    if merged:
        groups.append(merged)
    return groups


for fname in os.listdir(root_dir):
    if not fname.lower().endswith('.html'):
        continue
    file_path = os.path.join(root_dir, fname)
    if not os.path.isfile(file_path):
        continue
    # 1) Read and parse the HTML.
    print(file_path)
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    response_s = etree.HTML(content)
    # 2) Collect the ASIN lists of every recognised carousel/section.
    result_list = _extract_ad_asins(response_s)
    print('result_list 广告流量ASIN:', result_list)
    # 3) Serialise for downstream use; None marks "nothing found".
    result_list_json = (
        json.dumps(result_list, ensure_ascii=False) if result_list else None)


