"""Convert a KWIC-style CSV into a dependency-annotated XML file using GiNZA.

Each CSV row provides a left context ('前文脈'), a key ('キー') and a right
context ('後文脈').  The sentence containing the key is rebuilt from those
columns, parsed with the 'ja_ginza' spaCy model, and written out as one
<s> element per row containing one <w> element per token.
"""
import logging
import os
import re
import sys
import xml.etree.ElementTree as ET

import pandas as pd
import spacy
import ginza

# --- Global configuration ---
logging.getLogger("spacy").setLevel(logging.ERROR)

try:
    nlp = spacy.load('ja_ginza')
    print("spaCy model 'ja_ginza' loaded successfully.")
except OSError:
    print("Error: Could not load model 'ja_ginza'.")
    print("Please make sure you have run: pip install ginza-jp")
    sys.exit(1)

# CSV columns that are copied onto each <s> element as attributes
# (only the ones actually present in the CSV are used).
SENTENCE_ATTRIBUTES = [
    'レジスター', 'コア', '固定長', '可変長', '執筆者', '生年代', '性別',
    'ジャンル', '書名/出典', '副題/分類', '巻号', '編著者等', '出版者',
    '出版年', '話者名', '話者性別', '話者年齢層'
]

# Set to True after the "reading attribute not found" warning has been
# printed, so that the warning is shown only once per run.
reading_error_warned = False

# Control characters stripped from every text/attribute value before it is
# handed to ElementTree.  Tab, LF, CR and U+0085 are deliberately kept.
_XML_INVALID_CHARS_RE = re.compile(
    r'[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F-\u0084\u0086-\u009F]'
)

# Characters in CSV column names that cannot appear in an XML attribute
# name: '/', ' ' and '・' become '_'; ASCII parentheses are dropped.
_ATTR_NAME_TABLE = str.maketrans({
    '/': '_',
    '(': None,
    ')': None,
    ' ': '_',
    '・': '_',
})

_ASCII_DIGITS = "0123456789"


# --- Helper functions ---
def clean_xml_text(text) -> str:
    """Return *text* as a string with XML-hostile control characters removed.

    Non-string input is converted with ``str()`` first.
    """
    if not isinstance(text, str):
        text = str(text)
    return _XML_INVALID_CHARS_RE.sub('', text)


def clean_xml_attribute_name(attr_name) -> str:
    """Turn a CSV column name into a string usable as an XML attribute name.

    '/', ' ' and '・' are replaced by '_', ASCII parentheses are removed, and
    a name starting with an ASCII digit is prefixed with '_' because an XML
    name may not begin with a digit.  Non-string input is converted with
    ``str()`` first.
    """
    if not isinstance(attr_name, str):
        attr_name = str(attr_name)
    attr_name = attr_name.translate(_ATTR_NAME_TABLE)
    # Guard on non-empty first: '' is "in" every string.
    if attr_name and attr_name[0] in _ASCII_DIGITS:
        attr_name = "_" + attr_name
    return attr_name


def load_csv(filepath):
    """Read *filepath* into a DataFrame, trying UTF-8 and then CP932.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` if the file could not be
        read (an error message is printed in that case).
    """
    try:
        return pd.read_csv(filepath, encoding='utf-8')
    except UnicodeDecodeError:
        print(f"Warning: Failed to read {filepath} as UTF-8. Trying CP932 (Shift_JIS)...")
        try:
            return pd.read_csv(filepath, encoding='cp932')
        except Exception as e:
            print(f"Error: Failed to read CSV file {filepath} with both UTF-8 and CP932.")
            print(f"Details: {e}")
            return None
    except Exception as e:
        print(f"Error reading CSV file {filepath}: {e}")
        return None


def _token_reading(token) -> str:
    """Return the reading of *token*, falling back to its lemma.

    The reading is taken from the ``token._.reading`` extension.  If that
    extension is not available, a one-time warning is printed and the lemma
    is used instead; the lemma is also used when the reading is empty.
    """
    global reading_error_warned

    reading = None
    try:
        reading = token._.reading
    except AttributeError:
        # NOTE(review): newer GiNZA releases may expose readings differently
        # (e.g. through a helper in the ginza package) -- confirm against the
        # installed version before relying on this fallback.
        if not reading_error_warned:
            print("\n---")
            print(f"Warning: [E046] token._.reading attribute not found. Falling back to token.lemma_.")
            print("(This warning will only be shown once.)")
            print("---\n")
            reading_error_warned = True
    return reading if reading else token.lemma_


def _append_token_elements(sentence_elem, doc) -> None:
    """Append one <w> child to *sentence_elem* for every token in *doc*.

    Word ids and dependency references are 1-based token positions.
    'udFrom' is the head of the token; 'udTo' is the '/'-joined list of its
    children (empty when the token has none).
    """
    for token in doc:
        children = "/".join(str(child.i + 1) for child in token.children)
        reading = _token_reading(token)

        w = ET.SubElement(sentence_elem, 'w')
        w.set('wordId', str(token.i + 1))
        w.set('pos', clean_xml_text(token.pos_))
        w.set('lemma', clean_xml_text(token.lemma_))
        w.set('tag', clean_xml_text(token.tag_))
        w.set('udType', clean_xml_text(token.dep_))
        w.set('udFrom', str(token.head.i + 1))
        w.set('udTo', children)
        w.set('original', clean_xml_text(token.orth_))
        w.text = clean_xml_text(reading)


# --- Main processing ---
def process_data(df: pd.DataFrame) -> ET.Element:
    """Parse every row of *df* and build the XML tree.

    Adds the columns 'LeftContext', 'RightContext' and 'Sentence' to *df*
    (and fills missing values in the attribute columns with '') as a side
    effect.

    Args:
        df: DataFrame containing at least the columns '前文脈', 'キー' and
            '後文脈'.

    Returns:
        The <document> root element, holding one <s> element per row.  A
        row whose parsing raised gets a 'ProcessingError' attribute instead
        of a complete set of <w> children.
    """
    # fillna('') must come before astype(str): otherwise an empty CSV cell
    # (NaN) becomes the literal text "nan" inside the rebuilt sentence.
    # Only the part of each context adjacent to the key is kept ('#' is the
    # separator the contexts are split on).
    df['LeftContext'] = df['前文脈'].fillna('').astype(str).str.split('#').str[-1]
    df['RightContext'] = df['後文脈'].fillna('').astype(str).str.split('#').str[0]
    df['Sentence'] = (
        df['LeftContext'] + df['キー'].fillna('').astype(str) + df['RightContext']
    ).str.replace(r'|', '', regex=False)
    print("Columns 'LeftContext', 'RightContext', 'Sentence' (with '|' removed) created.")

    root = ET.Element('document')

    valid_attrs = [col for col in SENTENCE_ATTRIBUTES if col in df.columns]
    if not valid_attrs:
        print("Warning: None of the specified sentence attribute columns found in the CSV.")
    else:
        print(f"Using attributes for tag (original CSV names): {valid_attrs}")
        df[valid_attrs] = df[valid_attrs].fillna('')

    # Column name -> XML attribute name, computed once instead of per row.
    xml_attr_names = {col: clean_xml_attribute_name(col) for col in valid_attrs}

    total_rows = len(df)
    print(f"Processing {total_rows} sentences for XML conversion...")

    # `position` counts processed rows; `row_label` is the DataFrame index
    # label, which need not be a 0-based integer (e.g. after filtering).
    for position, (row_label, row) in enumerate(df.iterrows(), start=1):
        s = ET.SubElement(root, 's')

        # Copy the metadata columns of this row onto the <s> element.
        for col, xml_name in xml_attr_names.items():
            s.set(xml_name, clean_xml_text(row[col]))

        sentence_text = clean_xml_text(row['Sentence'])
        try:
            _append_token_elements(s, nlp(sentence_text))
        except Exception as e:
            # Best effort: record the failure on the element and keep going.
            print(f"Error processing sentence in row {row_label} (Text: '{sentence_text[:50]}...'): {e}")
            s.set("ProcessingError", clean_xml_text(str(e)))

        if position % 100 == 0 or position == total_rows:
            print(f" ... processed {position}/{total_rows} rows.")

    return root


def main() -> None:
    """Load the input CSV, convert it and write the resulting XML file."""
    # --- File path settings ---
    # Adjust these to match your own environment.
    INPUT_DIRECTORY = r"C:\Users\akita\Downloads"
    OUTPUT_DIRECTORY = INPUT_DIRECTORY
    INPUT_FILENAME = "tumu.csv"
    OUTPUT_FILENAME = "tumu_analysis.xml"

    input_filepath = os.path.join(INPUT_DIRECTORY, INPUT_FILENAME)
    output_filepath = os.path.join(OUTPUT_DIRECTORY, OUTPUT_FILENAME)

    print(f"Checking for input file at: {input_filepath}...")
    if not os.path.exists(input_filepath):
        print(f"Error: File not found at the specified path: {input_filepath}")
        return

    if not os.path.exists(OUTPUT_DIRECTORY):
        print(f"Output directory not found: {OUTPUT_DIRECTORY}")
        try:
            os.makedirs(OUTPUT_DIRECTORY)
            print(f"Created output directory: {OUTPUT_DIRECTORY}")
        except Exception as e:
            print(f"Error: Could not create output directory: {e}")
            return

    print(f"Loading CSV file: {input_filepath}...")
    df = load_csv(input_filepath)
    if df is None:
        return

    print("CSV loaded. Starting processing...")
    required_cols = ['前文脈', 'キー', '後文脈']
    if not all(col in df.columns for col in required_cols):
        print(f"Error: Missing one or more required columns: {required_cols}")
        print(f"Available columns: {df.columns.tolist()}")
        return

    xml_root = process_data(df)

    print(f"Writing XML to {output_filepath}...")
    tree = ET.ElementTree(xml_root)
    try:
        ET.indent(tree, space=" ")
    except AttributeError:
        print("Warning: ET.indent not available (requires Python 3.9+). XML will not be pretty-printed.")

    try:
        tree.write(output_filepath, encoding='utf-8', xml_declaration=True)
        print(f"\nSuccessfully generated XML file at: {output_filepath}")
    except Exception as e:
        print(f"\nError writing XML file to {output_filepath}: {e}")


if __name__ == "__main__":
    main()