import json
import os
import re

import pandas as pd
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, NavigableString, Tag
from sqlalchemy import create_engine

# SQLAlchemy engine for the `selection` MySQL database. The connection URL can
# be supplied through the SELLER_HELP_DB_URL environment variable so that
# credentials do not have to live in source control.
# NOTE(review): the fallback URL below still embeds a password in plain text;
# rotate it and remove the fallback once the environment variable is deployed.
engine_us_mysql = create_engine(
    os.environ.get(
        'SELLER_HELP_DB_URL',
        'mysql+pymysql://XP_Yswg2025_PY:Gd1pGJog1ysLMLBdML8w81@rm-wz9yg9bsb2zf01ea4yo.mysql.rds.aliyuncs.com:3306/selection?charset=utf8mb4',
    )
)

# Set to True by get_h2_all_text() when it drops a section whose heading text
# is empty but whose body has text; the main loop reads it and resets it.
is_not_table = False


def _sibling_text(node):
    """Return the cleaned text of one sibling node, or "" if it has none.

    A ``NavigableString`` is simply stripped. For a ``Tag``, every descendant
    ``<img>`` is first replaced *in place* by the text `` [Image: <src>] `` so
    the image link stays next to the text it belongs to; the tag's text is
    then collapsed to single spaces and stripped. An ``<img>`` without a
    ``src`` attribute yields ``[Image: None]``.
    """
    if isinstance(node, NavigableString):
        return node.strip()
    if isinstance(node, Tag):
        for img in node.find_all("img"):
            img_src = img.get("src")
            img.replace_with(f" [Image: {img_src}] ")
        return re.sub(r"\s+", " ", node.get_text()).strip()
    return ""


def get_h2_all_text(table, document=None):
    """Split a parsed page into sections, one per ``table`` heading tag.

    For every ``table`` element (e.g. ``'h2'`` or ``'h1'``) the following
    sibling nodes are collected as text until the next section heading.

    Args:
        table: Tag name of the headings that start a section.
        document: Parsed BeautifulSoup tree to search. When omitted, the
            module-level ``soup`` is used, as before.

    Returns:
        A list of ``{"name": heading_text, "text": [str, ...]}`` dicts in
        document order.

    Side effects:
        * ``<img>`` tags inside the visited siblings are replaced in the tree
          by their ``[Image: src]`` text.
        * A section whose heading text is empty but whose body has text is
          left out of the result and the module-level ``is_not_table`` flag is
          set to True instead.
    """
    global is_not_table

    tree = soup if document is None else document
    tag_name = str(table)
    # A section ends at the next heading of the same kind. "h2" is always kept
    # as a terminator too, so an h1 section never absorbs the h2 sections that
    # follow it (the previous behaviour), while an h1 section now also stops
    # at the next h1 instead of swallowing it.
    stop_names = {tag_name, "h2"}

    result = []
    for heading in tree.find_all(tag_name):
        heading_text = heading.get_text(strip=True)
        texts = []
        for sibling in heading.next_siblings:
            if isinstance(sibling, Tag) and sibling.name in stop_names:
                break
            text = _sibling_text(sibling)
            if text:
                texts.append(text)
        if not heading_text and texts:
            # Nameless heading with content: signal the caller rather than
            # emitting a section with an empty name.
            is_not_table = True
        else:
            result.append({"name": heading_text, "text": texts})
    return result


# Load every stored help page, split it into heading/text sections and collect
# [category_name, sections_as_json, original_html] rows in category_data_list.
sql = 'select category,outer_html from seller_help_html'
df = pd.read_sql(sql, con=engine_us_mysql)
category_data_list = []
for index, row in df.iterrows():
    category = row['category']
    # A NULL category arrives as None/NaN; use an empty name rather than
    # letting re.sub raise a TypeError.
    category_name = '' if pd.isna(category) else re.sub(r'\s+', ' ', category).strip()
    html_content = row['outer_html']
    # `soup` is deliberately module-level: get_h2_all_text() reads it.
    # A NULL page is parsed as an empty document instead of crashing.
    soup = BeautifulSoup('' if html_content is None else html_content, "html.parser")
    # Remove unwanted elements: the scroll-to-top button and the
    # "Was this article helpful?" block.
    top_button = soup.find("kat-button", class_="hh-scroll-to-top-box")
    if top_button:
        top_button.decompose()
    helpful_div = soup.find("div", class_="help-hmd")
    if helpful_div:
        helpful_div.decompose()
    result = get_h2_all_text('h2')
    # Fall back to / add the h1 sections when the h2 pass found at most one
    # section or dropped a nameless one. The h1 pass runs at most once, so its
    # sections are never appended twice.
    if len(result) <= 1 or is_not_table:
        result.extend(get_h2_all_text('h1'))
    # Clear the flag (either pass may have set it) before the next row.
    is_not_table = False
    print(result)
    result_json = json.dumps(result)
    category_data_list.append([category_name, result_json, html_content])
    # break

#
# data_list = []
# for category_data in category_data_list:
#     for category in category_data:
#         if category['text']and category['name']:
#             data_list.append([category['name'],category['text'][0]])
#
# print(data_list)
# df_seller_asin_account = pd.DataFrame(data=data_list,columns=['category_name','text'])
# df_seller_asin_account.to_csv(r'C:\Users\ASUS\Downloads\seller_help_data.csv', index=False, encoding='utf-8-sig')

# df_seller_asin_account = pd.DataFrame(data=category_data_list, columns=['category_name', 'result_json', 'outer_html'])
# df_seller_asin_account.to_sql('seller_help_data', con=engine_us_mysql, if_exists='append', index=False)
