127 lines
3.9 KiB
PHP
127 lines
3.9 KiB
PHP
<?php
|
|
|
|
/*
|
|
* check if string is valid in target encoding
|
|
*/
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace CoreLibs\Check;
|
|
|
|
class Encoding
{
    /**
     * Mode names mb_substitute_character() accepts instead of a code point.
     *
     * @var array<string>
     */
    private const SUBSTITUTE_MODES = ['none', 'long', 'entity'];

    /**
     * Substitute ("error") character as last configured via setErrorChar():
     * one of the mode names "none", "long", "entity", a single character,
     * or an empty string when setErrorChar() has not been called yet.
     *
     * @var int<min, -1>|int<1, max>|string
     */
    private static int|string $mb_error_char = '';

    /**
     * Set the character mbstring inserts for characters that cannot be
     * converted, and remember it for checkConvertEncoding().
     *
     * @param  string|int|null $string The character to use to represent
     *                                 error chars:
     *                                 "none", "long" or "entity" for the
     *                                 mbstring modes of the same name,
     *                                 a single character, or a valid code
     *                                 point as int like 0x2234 (8756, ∴).
     *                                 null, '', '0' and 0 select "none"
     * @return void
     * @throws \ValueError If the value is neither a mode name, a single
     *                     character nor a valid code point
     */
    public static function setErrorChar(string|int|null $string): void
    {
        // every "empty" value (null, '', '0', 0) means: no substitute char
        if (empty($string)) {
            $string = 'none';
        }
        // mode names are handed to mbstring unchanged
        if (in_array($string, self::SUBSTITUTE_MODES, true)) {
            mb_substitute_character($string);
            self::$mb_error_char = $string;
            return;
        }
        // internally the character is stored, mbstring wants the code point
        $char = \IntlChar::chr($string);
        $code_point = is_int($string) ? $string : \IntlChar::ord($string);
        if ($char === null || $code_point === null) {
            throw new \ValueError(
                'Invalid substitute character: ' . var_export($string, true)
            );
        }
        // configure mbstring first: if it rejects the code point the stored
        // character is left untouched and both stay in sync
        mb_substitute_character($code_point);
        self::$mb_error_char = $char;
    }

    /**
     * Get the currently set error character.
     *
     * @param  bool $return_substitute_func If set to true return the value
     *                                      reported by
     *                                      mb_substitute_character() instead
     *                                      of the internally stored one
     * @return string|int Set error character
     */
    public static function getErrorChar(bool $return_substitute_func = false): string|int
    {
        if (!$return_substitute_func) {
            return self::$mb_error_char;
        }
        $current = mb_substitute_character();
        // the function is typed string|int|bool; a bool is only expected from
        // its setter form, fall back to the stored value to keep the return
        // type intact
        return is_bool($current) ? self::$mb_error_char : $current;
    }

    /**
     * Test if a string can be safely converted between encodings,
     * mostly UTF-8 to Shift JIS.
     *
     * The string is converted to the target encoding and back. If the result
     * differs, each character is round-tripped on its own and the ones that
     * do not survive are collected. Checking character by character keeps the
     * result correct for the substitute modes "none", "long" and "entity",
     * where the converted string has a different length than the source.
     *
     * With no error character set, or one of the modes above, every
     * character that changes in the round trip is reported. With a concrete
     * error character set, only characters that were replaced by exactly
     * that character are reported.
     *
     * The default compare has a possibility of failure, especially with
     * windows. It is recommended to call
     *   Encoding::setErrorChar(0x2234);
     * before checks to Shift JIS, ISO-2022-JP or ISO-2022-JP-MS so that
     * three dots (∴) mark a wrong character for correct convert error detect
     * (this char is used, because it is one of the least used ones).
     *
     * @param  string $string        string to test
     * @param  string $from_encoding encoding of string to test
     * @param  string $to_encoding   target encoding
     * @return array<string>|false   false if no error (or the conversion
     *                               itself failed) or
     *                               array with failed characters
     */
    public static function checkConvertEncoding(
        string $string,
        string $from_encoding,
        string $to_encoding
    ): array|false {
        // convert to target encoding and convert back
        $compare = self::roundTrip($string, $from_encoding, $to_encoding);
        // unchanged string (or failed conversion): nothing to report
        if ($compare === false || $compare === $string) {
            return false;
        }
        $error_char = (string)self::$mb_error_char;
        // without a concrete error character any changed character counts
        $any_change_fails = $error_char === '' ||
            in_array($error_char, self::SUBSTITUTE_MODES, true);
        $failed = [];
        // go through each character and find the ones that do not match
        foreach (mb_str_split($string, 1, $from_encoding) as $char) {
            // the ord 194 is a hack to fix the IE7/IE8
            // bug with line break and illegal character:
            // characters whose first byte is 194 (0xC2) are never reported
            if (ord($char) === 194) {
                continue;
            }
            $r_char = self::roundTrip($char, $from_encoding, $to_encoding);
            if ($r_char === $char) {
                continue;
            }
            if ($any_change_fails || $r_char === $error_char) {
                $failed[] = $char;
            }
        }
        return $failed;
    }

    /**
     * Convert a string to the target encoding and back to the source one.
     *
     * @param  string $string        string to convert
     * @param  string $from_encoding encoding of the string
     * @param  string $to_encoding   intermediate target encoding
     * @return string|false          round tripped string, false if one of
     *                               the conversions failed
     */
    private static function roundTrip(
        string $string,
        string $from_encoding,
        string $to_encoding
    ): string|false {
        $converted = mb_convert_encoding($string, $to_encoding, $from_encoding);
        if ($converted === false) {
            return false;
        }
        return mb_convert_encoding($converted, $from_encoding, $to_encoding);
    }
}
|
|
|
|
// __END__
|