Move Encoding class away from the Language namespace

Language\Encoding::__mbMimeEncode -> Convert\MimeEncode::__mbMimeEncode
Langauge\Encoding::checkConvertEncoding -> Check\Encoding::checkConvertEncoding
Langauge\Encoding::setErrorChar -> Check\Encoding::setErrorChar
Langauge\Encoding::getErrorChar -> Encoding::getErrorChar
Langauge\Encoding::convertEncoding -> Convert\Encoding::convertEncoding

Also fixed encoding check that not only a code point but a string can
also be used as a parameter.

Update phpunit tests and split them out for each class

Normal test page is still combined for all classes but updated to
correctly use each class
This commit is contained in:
Clemens Schwaighofer
2022-04-13 09:25:42 +09:00
parent a3c49e408a
commit 6f4c5e36e6
12 changed files with 618 additions and 401 deletions

View File

@@ -0,0 +1,117 @@
<?php
/*
* check if string is valid in target encoding
*/
declare(strict_types=1);
namespace CoreLibs\Check;
class Encoding
{
/** @var int<min, -1>|int<1, max>|string */
private static $mb_error_char = '';
/**
* set error char
*
* @param string|int|null $string The character to use to represent
* error chars
* "long" for long, "none" for none
* or a valid code point in int
* like 0x2234 (8756, ∴)
* default character is ? (63)
* if null is set then "none"
* @return void
*/
public static function setErrorChar($string): void
{
if (empty($string)) {
$string = 'none';
}
// if not special string or char but code point
if (in_array($string, ['none', 'long', 'entity'])) {
self::$mb_error_char = $string;
} else {
// always convert to char for internal use
self::$mb_error_char = \IntlChar::chr($string);
// if string convert to code point
if (is_string($string)) {
$string = \IntlChar::ord($string);
}
}
mb_substitute_character($string);
}
/**
* get the current set error character
*
* @param bool $return_substitute_func if set to true return the set
* character from the php function
* directly
* @return string|int Set error character
*/
public static function getErrorChar(bool $return_substitute_func = false)
{
// return mb_substitute_character();
if ($return_substitute_func === true) {
return mb_substitute_character();
} else {
return self::$mb_error_char;
}
}
/**
* test if a string can be safely convert between encodings.
* mostly utf8 to shift jis
* the default compare has a possibility of failure, especially with windows
* it is recommended to the following in the script which uses this method:
* mb_substitute_character(0x2234);
* $class->mb_error_char = '∴';
* if check to Shift JIS
* if check to ISO-2022-JP
* if check to ISO-2022-JP-MS
* set three dots (∴) as wrong character for correct convert error detect
* (this char is used, because it is one of the least used ones)
*
* @param string $string string to test
* @param string $from_encoding encoding of string to test
* @param string $to_encoding target encoding
* @return bool|array<string> false if no error or
* array with failed characters
*/
public static function checkConvertEncoding(
string $string,
string $from_encoding,
string $to_encoding
) {
// convert to target encoding and convert back
$temp = mb_convert_encoding($string, $to_encoding, $from_encoding);
$compare = mb_convert_encoding($temp, $from_encoding, $to_encoding);
// if string does not match anymore we have a convert problem
if ($string != $compare) {
$failed = [];
// go through each character and find the ones that do not match
for ($i = 0, $iMax = mb_strlen($string, $from_encoding); $i < $iMax; $i++) {
$char = mb_substr($string, $i, 1, $from_encoding);
$r_char = mb_substr($compare, $i, 1, $from_encoding);
// the ord 194 is a hack to fix the IE7/IE8
// bug with line break and illegal character
if (
(($char != $r_char && (!self::$mb_error_char ||
in_array(self::$mb_error_char, ['none', 'long', 'entity']))) ||
($char != $r_char && $r_char == self::$mb_error_char && self::$mb_error_char)) &&
ord($char) != 194
) {
$failed[] = $char;
}
}
return $failed;
} else {
return false;
}
}
}
// __END__