Update the CSV helper classes: add a configurable encoding and detect BOM-encoded UTF files when reading

2025-12-16 18:53:16 +09:00
parent 11a75d8532
commit cf575ded90
1 changed files with 17 additions and 2 deletions
--- a/src/corelibs/csv_handling/csv_interface.py
+++ b/src/corelibs/csv_handling/csv_interface.py
@@ -7,10 +7,13 @@ from typing import Any, Sequence
 from pathlib import Path
 from collections import Counter
 import csv
+from corelibs.file_handling.file_bom_encoding import is_bom_encoded, is_bom_encoded_info
 from corelibs.exceptions.csv_exceptions import (
    NoCsvReader, CompulsoryCsvHeaderCheckFailed, CsvHeaderDataMissing
 )

+ENCODING = 'utf-8'
+ENCODING_UTF8_SIG = 'utf-8-sig'
 DELIMITER = ","
 QUOTECHAR = '"'
 # type: _QuotingType
@@ -27,6 +30,7 @@ class CsvWriter:
        file_name: Path,
        header_mapping: dict[str, str],
        header_order: list[str] | None = None,
+        encoding: str = ENCODING,
        delimiter: str = DELIMITER,
        quotechar: str = QUOTECHAR,
        quoting: Any = QUOTING,
@@ -38,6 +42,7 @@ class CsvWriter:
        self.__delimiter = delimiter
        self.__quotechar = quotechar
        self.__quoting = quoting
+        self.__encoding = encoding
        self.csv_file_writer = self.__open_csv(header_order)

    def __open_csv(self, header_order: list[str] | None) -> csv.DictWriter[str]:
@@ -69,7 +74,8 @@ class CsvWriter:
        try:
            fp = open(
                self.__file_name,
-                "w", encoding="utf-8"
+                "w",
+                encoding=self.__encoding
            )
            csv_file_writer = csv.DictWriter(
                fp,
@@ -109,6 +115,7 @@ class CsvReader:
        self,
        file_name: Path,
        header_check: Sequence[str] | None = None,
+        encoding: str = ENCODING,
        delimiter: str = DELIMITER,
        quotechar: str = QUOTECHAR,
        quoting: Any = QUOTING,
@@ -118,6 +125,7 @@ class CsvReader:
        self.__delimiter = delimiter
        self.__quotechar = quotechar
        self.__quoting = quoting
+        self.__encoding = encoding
        self.header: Sequence[str] | None = None
        self.csv_file_reader = self.__open_csv()

@@ -129,9 +137,16 @@ class CsvReader:
            csv.DictReader | None: _description_
        """
        try:
+            # for a UTF encoding, check whether the file has a BOM and switch to the matching encoding
+            if self.__encoding.lower().startswith('utf-') and is_bom_encoded(self.__file_name):
+                bom_info = is_bom_encoded_info(self.__file_name)
+                if bom_info['encoding'] == 'utf-8':
+                    self.__encoding = ENCODING_UTF8_SIG
+                else:
+                    self.__encoding = bom_info['encoding'] or self.__encoding
            fp = open(
                self.__file_name,
-                "r", encoding="utf-8"
+                "r", encoding=self.__encoding
            )
            csv_file_reader = csv.DictReader(
                fp,