Add checks for BOM encoding in files
This commit is contained in:
75
src/corelibs/file_handling/file_bom_encoding.py
Normal file
75
src/corelibs/file_handling/file_bom_encoding.py
Normal file
@@ -0,0 +1,75 @@
|
||||
"""
|
||||
File check if BOM encoded, needed for CSV load
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import TypedDict
|
||||
|
||||
|
||||
class BomEncodingInfo(TypedDict):
|
||||
"""BOM encoding info"""
|
||||
has_bom: bool
|
||||
bom_type: str | None
|
||||
encoding: str | None
|
||||
bom_length: int
|
||||
bom_pattern: bytes | None
|
||||
|
||||
|
||||
def is_bom_encoded(file_path: Path) -> bool:
|
||||
"""
|
||||
Detect if a file is BOM encoded
|
||||
|
||||
Args:
|
||||
file_path (str): Path to the file to check
|
||||
|
||||
Returns:
|
||||
bool: True if file has BOM, False otherwise
|
||||
"""
|
||||
return is_bom_encoded_info(file_path)['has_bom']
|
||||
|
||||
|
||||
def is_bom_encoded_info(file_path: Path) -> BomEncodingInfo:
|
||||
"""
|
||||
Enhanced BOM detection with additional file analysis
|
||||
|
||||
Args:
|
||||
file_path (str): Path to the file to check
|
||||
|
||||
Returns:
|
||||
dict: Comprehensive BOM and encoding information
|
||||
"""
|
||||
try:
|
||||
# Read first 1024 bytes for analysis
|
||||
with open(file_path, 'rb') as f:
|
||||
header = f.read(4)
|
||||
|
||||
bom_patterns = {
|
||||
b'\xef\xbb\xbf': ('UTF-8', 'utf-8', 3),
|
||||
b'\xff\xfe\x00\x00': ('UTF-32 LE', 'utf-32-le', 4),
|
||||
b'\x00\x00\xfe\xff': ('UTF-32 BE', 'utf-32-be', 4),
|
||||
b'\xff\xfe': ('UTF-16 LE', 'utf-16-le', 2),
|
||||
b'\xfe\xff': ('UTF-16 BE', 'utf-16-be', 2),
|
||||
}
|
||||
|
||||
for bom_pattern, (encoding_name, encoding, length) in bom_patterns.items():
|
||||
if header.startswith(bom_pattern):
|
||||
return {
|
||||
'has_bom': True,
|
||||
'bom_type': encoding_name,
|
||||
'encoding': encoding,
|
||||
'bom_length': length,
|
||||
'bom_pattern': bom_pattern
|
||||
}
|
||||
|
||||
return {
|
||||
'has_bom': False,
|
||||
'bom_type': None,
|
||||
'encoding': None,
|
||||
'bom_length': 0,
|
||||
'bom_pattern': None
|
||||
}
|
||||
except Exception as e:
|
||||
raise ValueError(f"Error checking BOM encoding: {e}") from e
|
||||
|
||||
|
||||
# __END__
|
||||
Reference in New Issue
Block a user