import pandas as pd
import spacy
import ginza
import xml.etree.ElementTree as ET
import re
import sys
import logging
import os

# --- Global settings ---

# Only let spaCy's logger emit ERROR and above, so loading stays quiet.
logging.getLogger("spacy").setLevel(logging.ERROR)
try:
    # Module-level pipeline shared by process_data(); loaded once at import.
    nlp = spacy.load('ja_ginza')
    print("spaCy model 'ja_ginza' loaded successfully.")
except OSError:
    # OSError is treated here as "the model package is not installed".
    print("Error: Could not load model 'ja_ginza'.")
    # The install hint names both the GiNZA library and its model package.
    print("Please make sure you have run: pip install -U ginza ja-ginza")
    sys.exit(1)

# CSV column names that process_data() copies onto each <s> element as
# attributes (only those actually present in the CSV are used).
SENTENCE_ATTRIBUTES = [
    'レジスター',    # register
    'コア',          # core
    '固定長',        # fixed length
    '可変長',        # variable length
    '執筆者',        # author
    '生年代',        # birth decade
    '性別',          # gender
    'ジャンル',      # genre
    '書名/出典',     # title / source
    '副題/分類',     # subtitle / classification
    '巻号',          # volume / issue
    '編著者等',      # editors, authors, etc.
    '出版者',        # publisher
    '出版年',        # publication year
    '話者名',        # speaker name
    '話者性別',      # speaker gender
    '話者年齢層',    # speaker age group
]

# Flipped to True by process_data() after the missing-reading warning has been
# printed, so that the warning appears only once per run.
reading_error_warned = False

# --- Helper functions ---

def clean_xml_text(text):
    """Return *text* as a string with characters unsafe for XML output removed.

    Non-string input is converted with ``str()`` first.  The following are
    stripped:

    * NUL and the C0 control characters other than TAB, LF and CR,
    * DEL and the C1 control characters other than NEL (U+0085),
    * lone surrogates (U+D800-U+DFFF), which cannot be encoded as UTF-8,
    * the non-characters U+FFFE and U+FFFF, which XML 1.0 does not allow.

    Args:
        text: Any value; typically a CSV cell or a token attribute.

    Returns:
        The cleaned string.
    """
    if not isinstance(text, str):
        text = str(text)
    return re.sub(
        r'[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F-\u0084\u0086-\u009F'
        r'\uD800-\uDFFF\uFFFE\uFFFF]',
        '',
        text
    )

# ★★★ New helper function ★★★
def clean_xml_attribute_name(attr_name):
    """Convert a CSV column name into a string usable as an XML attribute name.

    Non-string input is converted with ``str()`` first.  Then:

    * ``/``, the ASCII space and the middle dot ``・`` become ``_``;
    * the ASCII parentheses ``(`` and ``)`` are removed;
    * if the result starts with an ASCII digit, ``-`` or ``.`` (none of which
      may begin an XML name), an underscore is prepended.

    Args:
        attr_name: The original column name.

    Returns:
        The sanitized name.  An empty input yields an empty string.
    """
    if not isinstance(attr_name, str):
        attr_name = str(attr_name)

    replacements = (
        ('/', '_'),
        ('(', ''),
        (')', ''),
        (' ', '_'),
        ('・', '_'),
    )
    for char, substitute in replacements:
        attr_name = attr_name.replace(char, substitute)

    # XML names may contain digits, '-' and '.', but must not start with them.
    if attr_name and attr_name[0] in '0123456789-.':
        attr_name = '_' + attr_name

    return attr_name
# ★★★★★★★★★★★★★★★★★★★

def load_csv(filepath):
    """Read a CSV file into a DataFrame, trying UTF-8 and then CP932.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` when the file could not
        be read (an error message is printed in that case).
    """
    try:
        frame = pd.read_csv(filepath, encoding='utf-8')
    except UnicodeDecodeError:
        # Not valid UTF-8: fall through to the CP932 attempt below.
        print(f"Warning: Failed to read {filepath} as UTF-8. Trying CP932 (Shift_JIS)...")
    except Exception as e:
        print(f"Error reading CSV file {filepath}: {e}")
        return None
    else:
        return frame

    # Second attempt, reached only after a UTF-8 decoding failure.
    try:
        return pd.read_csv(filepath, encoding='cp932')
    except Exception as e:
        print(f"Error: Failed to read CSV file {filepath} with both UTF-8 and CP932.")
        print(f"Details: {e}")
        return None

# --- Main processing ---

def process_data(df):
    """Build a ``<document>`` XML tree with one parsed ``<s>`` per CSV row.

    For every row the sentence is reconstructed from the last ``#``-separated
    piece of ``前文脈``, the ``キー`` cell and the first ``#``-separated piece
    of ``後文脈``, with every ``|`` removed.  The sentence is analysed with
    the module-level ``nlp`` pipeline and each token becomes a ``<w>`` child
    whose text is the token's reading (falling back to its lemma).

    Side effects:
        * ``df`` is modified in place: the columns ``LeftContext``,
          ``RightContext`` and ``Sentence`` are added, and missing values in
          the attribute columns are replaced by ``''``.
        * Progress and warnings are printed to stdout.
        * The module-level flag ``reading_error_warned`` may be set.

    Args:
        df: DataFrame containing at least ``前文脈``, ``キー`` and ``後文脈``.

    Returns:
        The root ``xml.etree.ElementTree.Element`` named ``document``.
    """
    global reading_error_warned

    # fillna('') must come before astype(str): astype(str) alone would turn an
    # empty (NaN) cell into the literal text 'nan' inside the sentence.
    df['LeftContext'] = df['前文脈'].fillna('').astype(str).str.split('#').str[-1]
    df['RightContext'] = df['後文脈'].fillna('').astype(str).str.split('#').str[0]
    df['Sentence'] = (
        df['LeftContext'] + df['キー'].fillna('').astype(str) + df['RightContext']
    ).str.replace('|', '', regex=False)

    print("Columns 'LeftContext', 'RightContext', 'Sentence' (with '|' removed) created.")

    root = ET.Element('document')

    valid_attrs = [col for col in SENTENCE_ATTRIBUTES if col in df.columns]
    if not valid_attrs:
        print("Warning: None of the specified sentence attribute columns found in the CSV.")
    else:
        print(f"Using attributes for <s> tag (original CSV names): {valid_attrs}")
        df[valid_attrs] = df[valid_attrs].fillna('')

    # (CSV column, sanitized XML attribute name) pairs, computed once instead
    # of once per row.
    attr_columns = [(col, clean_xml_attribute_name(col)) for col in valid_attrs]

    total_rows = len(df)
    print(f"Processing {total_rows} sentences for XML conversion...")

    # `position` is the 1-based row count used for progress reporting; `i` is
    # the DataFrame index label, which need not be a consecutive integer.
    for position, (i, row) in enumerate(df.iterrows(), start=1):
        s = ET.SubElement(root, 's')

        # Copy the metadata columns onto the <s> element.
        for col, xml_attr_name in attr_columns:
            s.set(xml_attr_name, clean_xml_text(row[col]))

        sentence_text = clean_xml_text(row['Sentence'])

        try:
            doc = nlp(sentence_text)

            for token in doc:
                # 1-based index of the syntactic head and of every child.
                udFrom = token.head.i + 1
                udTo = "/".join(str(child.i + 1) for child in token.children)

                try:
                    reading = token._.reading
                except AttributeError:
                    reading = None
                    if not reading_error_warned:
                        print("\n---")
                        print("Warning: [E046] token._.reading attribute not found. Falling back to token.lemma_.")
                        print("(This warning will only be shown once.)")
                        print("---\n")
                        reading_error_warned = True

                # Missing or empty reading: use the lemma as the element text.
                if not reading:
                    reading = token.lemma_

                w = ET.SubElement(s, 'w')
                w.set('wordId', str(token.i + 1))
                w.set('pos', clean_xml_text(token.pos_))
                w.set('lemma', clean_xml_text(token.lemma_))
                w.set('tag', clean_xml_text(token.tag_))
                w.set('udType', clean_xml_text(token.dep_))
                w.set('udFrom', str(udFrom))
                w.set('udTo', udTo)
                w.set('original', clean_xml_text(token.orth_))
                w.text = clean_xml_text(reading)

        except Exception as e:
            # Best effort: record the failure on the <s> element and carry on
            # with the next row instead of aborting the whole conversion.
            print(f"Error processing sentence in row {i} (Text: '{sentence_text[:50]}...'): {e}")
            s.set("ProcessingError", clean_xml_text(str(e)))

        if position % 100 == 0 or position == total_rows:
            print(f"  ... processed {position}/{total_rows} rows.")

    return root

def main(input_filepath=None, output_filepath=None):
    """Convert a concordance CSV file into an analysed XML file.

    Loads the CSV, checks that the required columns exist, runs
    ``process_data`` and writes the resulting tree as UTF-8 XML with an XML
    declaration.  Every failure is reported on stdout and ends the function
    early.

    Args:
        input_filepath: Path of the CSV to read.  When ``None``, the built-in
            default location below is used.
        output_filepath: Path of the XML file to write.  When ``None``, the
            built-in default location below is used.

    Returns:
        None.
    """
    # --- Default file locations ---
    # Adjust these to your environment, or pass explicit paths instead.
    INPUT_DIRECTORY = r"C:\Users\akita\Downloads"
    OUTPUT_DIRECTORY = INPUT_DIRECTORY

    INPUT_FILENAME = "tumu.csv"
    OUTPUT_FILENAME = "tumu_analysis.xml"

    if input_filepath is None:
        input_filepath = os.path.join(INPUT_DIRECTORY, INPUT_FILENAME)
    if output_filepath is None:
        output_directory = OUTPUT_DIRECTORY
        output_filepath = os.path.join(OUTPUT_DIRECTORY, OUTPUT_FILENAME)
    else:
        # Empty when the caller passed a bare file name (current directory).
        output_directory = os.path.dirname(output_filepath)

    print(f"Checking for input file at: {input_filepath}...")

    if not os.path.exists(input_filepath):
        print(f"Error: File not found at the specified path: {input_filepath}")
        return

    if output_directory and not os.path.exists(output_directory):
        print(f"Output directory not found: {output_directory}")
        try:
            os.makedirs(output_directory)
            print(f"Created output directory: {output_directory}")
        except OSError as e:
            print(f"Error: Could not create output directory: {e}")
            return

    print(f"Loading CSV file: {input_filepath}...")
    df = load_csv(input_filepath)

    # load_csv has already printed the reason for the failure.
    if df is None:
        return

    print("CSV loaded. Starting processing...")

    required_cols = ['前文脈', 'キー', '後文脈']
    if not all(col in df.columns for col in required_cols):
        print(f"Error: Missing one or more required columns: {required_cols}")
        print(f"Available columns: {df.columns.tolist()}")
        return

    xml_root = process_data(df)

    print(f"Writing XML to {output_filepath}...")
    tree = ET.ElementTree(xml_root)

    try:
        ET.indent(tree, space="    ")
    except AttributeError:
        print("Warning: ET.indent not available (requires Python 3.9+). XML will not be pretty-printed.")

    try:
        tree.write(output_filepath, encoding='utf-8', xml_declaration=True)
        print(f"\nSuccessfully generated XML file at: {output_filepath}")
    except Exception as e:
        print(f"\nError writing XML file to {output_filepath}: {e}")

# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()