Add checks for BOM encoding in files

This commit is contained in:
Clemens Schwaighofer
2025-11-06 18:21:32 +09:00
parent 0e6331fa6a
commit 4e78d83092
5 changed files with 656 additions and 0 deletions

View File

@@ -0,0 +1,75 @@
"""
Check whether a file starts with a BOM (byte order mark); needed when loading CSV files
"""
from pathlib import Path
from typing import TypedDict
class BomEncodingInfo(TypedDict):
"""BOM encoding info"""
has_bom: bool
bom_type: str | None
encoding: str | None
bom_length: int
bom_pattern: bytes | None
def is_bom_encoded(file_path: Path) -> bool:
    """
    Tell whether a file starts with a byte order mark (BOM)

    Convenience wrapper around is_bom_encoded_info() that reduces the
    detailed result to its 'has_bom' flag.

    Args:
        file_path: Path to the file to check

    Returns:
        bool: True if file has BOM, False otherwise

    Raises:
        ValueError: passed through from is_bom_encoded_info() when the
            file cannot be checked
    """
    bom_info = is_bom_encoded_info(file_path)
    return bom_info['has_bom']
# Known BOM signatures as (signature bytes, display name, Python codec name).
# Order matters: the UTF-32 LE signature starts with the UTF-16 LE signature,
# so the 4 byte patterns have to be tested before the 2 byte patterns.
_BOM_PATTERNS: tuple[tuple[bytes, str, str], ...] = (
    (b'\xef\xbb\xbf', 'UTF-8', 'utf-8'),
    (b'\xff\xfe\x00\x00', 'UTF-32 LE', 'utf-32-le'),
    (b'\x00\x00\xfe\xff', 'UTF-32 BE', 'utf-32-be'),
    (b'\xff\xfe', 'UTF-16 LE', 'utf-16-le'),
    (b'\xfe\xff', 'UTF-16 BE', 'utf-16-be'),
)
# Longest signature (4 bytes): no more than this has to be read from the file
_BOM_MAX_LENGTH = max(len(_signature) for _signature, _, _ in _BOM_PATTERNS)


def is_bom_encoded_info(file_path: Path) -> BomEncodingInfo:
    """
    Detect a byte order mark (BOM) at the start of a file

    Only the leading bytes of the file (at most the longest known
    signature, 4 bytes) are read and compared against the UTF-8,
    UTF-16 LE/BE and UTF-32 LE/BE signatures.

    Args:
        file_path: Path to the file to check

    Returns:
        BomEncodingInfo: 'has_bom' is True and 'bom_type', 'encoding',
            'bom_length' and 'bom_pattern' describe the match when a
            signature is found; otherwise 'has_bom' is False,
            'bom_length' is 0 and the other entries are None.
            NOTE: the reported codec names ('utf-8', 'utf-16-le', ...)
            do not consume the BOM on decode, 'bom_length' gives the
            number of signature bytes at the start of the file.

    Raises:
        ValueError: if the file cannot be opened or read, the original
            exception is chained as the cause
    """
    # Only the file access can fail, so nothing else lives inside the try
    try:
        with open(file_path, 'rb') as f:
            header = f.read(_BOM_MAX_LENGTH)
    except Exception as e:
        # Deliberately broad: any failure to read the file is reported to
        # the caller as ValueError, which is the documented error type
        raise ValueError(f"Error checking BOM encoding: {e}") from e

    for bom_pattern, encoding_name, encoding in _BOM_PATTERNS:
        if header.startswith(bom_pattern):
            return {
                'has_bom': True,
                'bom_type': encoding_name,
                'encoding': encoding,
                'bom_length': len(bom_pattern),
                'bom_pattern': bom_pattern,
            }

    # Empty file, file shorter than a signature, or no signature matched
    return {
        'has_bom': False,
        'bom_type': None,
        'encoding': None,
        'bom_length': 0,
        'bom_pattern': None,
    }
# __END__