Web components
File detail
Source code
<?php
/**
* @file CharsetDetector.class.php
* Definition of CharsetDetector class group.
* @ingroup CharsetDetector
* @author Kamil Dudka <xdudka00@gmail.com>
* @date 2008-01-30
*/
/**
* @defgroup CharsetDetector
* At http://dudka.cz/CharsetDetector you may find an abstract, documentation
* and a tutorial for this component.\ See the CharsetDetector class if you don't
* know where to begin.
*/
/**
* Intelligent charset detector - basic interface.
* @ingroup CharsetDetector
* This class is an intelligent wrapper around the iconv library. It can
* guess the charset of a given text using static analysis. The default behavior
* is highly extensible - you can load configuration for different
* charsets or different languages, and you can do so dynamically.
* @note Design pattern @b facade.
*/
class CharsetDetector {
    const DEFAULT_TARGET_CHARSET = "UTF-8"; ///< Default target charset.\ Can be overridden on the fly.
    const DEFAULT_MAX_ANALYZED_LENGTH = 65536; ///< Default value for maxLengthAnalyzed parameter of analyze method.
    const DEFAULT_MIN_RELEVANCE = 0.1; ///< Default value for minRelevance property.

    private $analyzer;
    private $minRelevance;
    private $targetCharset;

    /**
     * Intelligent "equivalent" of the standard file_get_contents PHP function.
     * @note This is the simplest way to use this class.
     * @param filename Name of file to read (and recode if needed).
     * @return Return text of the file recoded to the @b default target charset,
     * or false if the file could not be read.
     */
    public static function file_get_contents($filename) {
        $contents = file_get_contents($filename);
        if (false === $contents)
            // Reading failed - report it the same way the standard function does
            // instead of pushing a non-string value through the detector.
            return false;
        return self::convert($contents);
    }

    /**
     * Convert string to desired charset if the detected charset is relevant @e enough.
     * If no charset is detected (or it is not relevant enough), the original input is returned.
     * @param stringToConvert String to convert - input of this method.
     * @param targetCharset Charset to convert to. Using iconv's mark-up.
     * @return Return converted string if charset was detected, or original in other case.
     */
    public static function convert($stringToConvert, $targetCharset = self::DEFAULT_TARGET_CHARSET) {
        $instance = new self;
        $instance->setTargetCharset($targetCharset);
        $instance->analyze($stringToConvert);
        return $instance->convertIfRelevant($stringToConvert);
    }

    /**
     * @param charsetStreamAnalyzer Initialized instance of analyzer.
     * Use CharsetStreamAnalyzerFactory class to create such object, or
     * omit this parameter to use default analyzer.
     */
    public function __construct($charsetStreamAnalyzer = null) {
        $this->analyzer = (null == $charsetStreamAnalyzer)
            ? CharsetStreamAnalyzerFactory::createDefault()
            : $charsetStreamAnalyzer;
        $this->minRelevance = self::DEFAULT_MIN_RELEVANCE;
        $this->targetCharset = self::DEFAULT_TARGET_CHARSET;
    }

    /**
     * @return Return instance of CharsetStreamAnalyzer
     * currently being used by CharsetDetector.
     */
    public function getCharsetStreamAnalyzer() {
        return $this->analyzer;
    }

    /**
     * Analyze a piece of given text and try to detect its charset.
     * Only bytes with the high bit set (0x80 and above) are handed to the
     * analyzer; plain ASCII bytes are skipped.
     * @note This method can be called repeatedly; the analyzer accumulates results.
     * @param stringToAnalyze Text to use for analysis.
     * @param maxLengthAnalyzed Maximum number of leading bytes used for analysis.
     */
    public function analyze($stringToAnalyze, $maxLengthAnalyzed = self::DEFAULT_MAX_ANALYZED_LENGTH) {
        $length = strlen($stringToAnalyze);
        if ($length > $maxLengthAnalyzed)
            // Limit analyzed length
            $length = $maxLengthAnalyzed;
        for ($i = 0; $i < $length; $i++) {
            $char = $stringToAnalyze[$i];
            if (0x80 <= ord($char))
                // Non-ASCII byte
                $this->analyzer->putchar($char);
        }
    }

    /**
     * Convert given text to target charset if the detected charset is relevant @e enough.
     * The best candidate is the first item of the analyzer's sorted weight map.
     * @note This method can be called repeatedly.
     * @param stringToConvert Text to convert - input of this method.
     * @return Return text converted to target charset, or the original text
     * if no relevant charset was detected or the conversion itself failed.
     */
    public function convertIfRelevant($stringToConvert) {
        $weightMap = $this->analyzer->getCharsetWeightMap();
        $iterator = $weightMap->createSortedIterator();
        $best = $iterator->next();
        if (null === $best)
            // Nothing was detected at all
            return $stringToConvert;
        $relWeight = $best->getRelWeight();
        $charset = $best->getCharset();
        if ($relWeight < $this->minRelevance)
            return $stringToConvert;
        $converted = CharsetConverter::convertCharset($charset, $this->targetCharset, $stringToConvert);
        // The converter hands back iconv's result, which is false on failure.
        // Keep the caller's text in that case rather than replacing it with false.
        return (false === $converted) ? $stringToConvert : $converted;
    }

    /**
     * @return Return target charset.
     */
    public function getTargetCharset() {
        return $this->targetCharset;
    }

    /**
     * Set target charset.
     * @param targetCharset Target charset to set to.
     */
    public function setTargetCharset($targetCharset) {
        $this->targetCharset = $targetCharset;
    }

    /**
     * See documentation of CharsetDetector::setMinRelevance method.
     * @return Return minimal relevance.
     */
    public function getMinRelevance() {
        return $this->minRelevance;
    }

    /**
     * Set minimal relevance.\ Relevance is meant to be normalized to interval <-1, 1>.\
     * 1 means surely detected, -1 means surely ruled out and 0 means don't know.\
     * @note This parameter should always be greater than zero.
     * @param minRelevance Minimal relevance to set to.
     */
    public function setMinRelevance($minRelevance) {
        $this->minRelevance = $minRelevance;
    }
}
/**
* Thin wrapper around the iconv library.
* @ingroup CharsetDetector
* Created in the hope that it reduces the direct dependency on the iconv library.
*/
class CharsetConverter {
    /**
     * Recode a string by delegating to the standard iconv function.
     * Refer to the manual of that function for details.
     * @param sourceEncoding Charset the input is encoded in.
     * @param targetEncoding Charset to convert to.
     * @param stringToConvert String to convert.
     * @return Return whatever iconv returns for these arguments.
     */
    public static function convertCharset($sourceEncoding, $targetEncoding, $stringToConvert) {
        $recoded = iconv($sourceEncoding, $targetEncoding, $stringToConvert);
        return $recoded;
    }
}
/**
* Input stream interface.
* @ingroup CharsetDetector
* Now supporting only one-char transfer.
*/
interface IInputStream {
/**
* Send one character to the stream.
* @param charToPut Character to send to the stream.
*/
public function putchar($charToPut);
};
/**
* Interface of extensible character weight maps.
* @ingroup CharsetDetector
*/
interface ICharWeightMapList {
/**
* Add a weight map for a new charset or overwrite an existing one.
* @param charsetName Name of the charset to define.
* @param charWeightMap Weight map's data. Format is @e character @e => @e weight.
*/
public function addWeightMap($charsetName, $charWeightMap);
};
/**
* This class analyzes text to detect its charset.
* @ingroup CharsetDetector
*/
class CharsetStreamAnalyzer implements IInputStream, ICharWeightMapList
{
    private $weightMapList = array();
    private $charsetWeightMap;

    public function __construct() {
        $this->charsetWeightMap = new CharsetWeightMap;
    }

    /**
     * Analyze one character.
     * The character's ordinal value is looked up in every registered weight map;
     * each map that contains it contributes its weight to that charset's score.
     * @copydoc IInputStream::putchar
     */
    public function putchar($charToPut) {
        $code = ord($charToPut);
        foreach ($this->weightMapList as $charsetName => $weights) {
            if (!array_key_exists($code, $weights))
                continue;
            $this->charsetWeightMap->addWeight($charsetName, $weights[$code]);
        }
    }

    /**
     * Add a weight map for a new charset or overwrite an existing one.
     * The charset is also registered in the result map with a zero increment.
     * @copydoc ICharWeightMapList::addWeightMap
     */
    public function addWeightMap($charsetName, $charWeightMap) {
        $this->charsetWeightMap->addWeight($charsetName, 0);
        $this->weightMapList[$charsetName] = $charWeightMap;
    }

    /**
     * Return the results of the analysis collected so far.
     * @return Return a copy (clone) of the CharsetWeightMap object containing the results.
     */
    public function getCharsetWeightMap() {
        $snapshot = clone $this->charsetWeightMap;
        return $snapshot;
    }
}
/**
* Container for the data of a charset analysis.
* @ingroup CharsetDetector
*/
class CharsetWeightMap {
    private $map = array();
    private $totalWeight = 0;

    /**
     * Add a weight to the counter of the selected charset.
     * Positive weights are additionally summed into the total weight, which
     * serves as the divisor for relative weights.
     * @param charset Charset name whose counter is changed.
     * @param weight Weight of the increment - can be a positive or negative integer.
     */
    public function addWeight($charset, $weight) {
        if (!isset($this->map[$charset])) {
            // Start from zero explicitly; avoids 'Notice: Undefined index' with log level E_NOTICE
            $this->map[$charset] = 0;
        }
        $this->map[$charset] += $weight;
        if ($weight > 0) {
            $this->totalWeight += $weight;
        }
    }

    /**
     * Create an iterator over the maintained items in descending order of weight.
     * The iterator is empty while the total of positive weights is not greater than zero.
     * @return Return initialized object, which implements
     * ICharsetWeightMapIterator interface.
     */
    public function createSortedIterator() {
        $iterator = new CharsetWeightMapIterator;
        if (!($this->totalWeight > 0)) {
            return $iterator;
        }
        $ordered = $this->map;
        arsort($ordered);
        foreach ($ordered as $charset => $weight) {
            // Relative weight = absolute weight divided by the sum of all positive increments
            $relative = $weight / $this->totalWeight;
            $iterator->add(new CharsetWeightMapItem($charset, $weight, $relative));
        }
        return $iterator;
    }
}
/**
* Interface of charset weight map iterator.
* @ingroup CharsetDetector
* @note Design pattern @b iterator.
*/
interface ICharsetWeightMapIterator {
/**
* Return the current item and then move on to the next one.
* @return Return current item or null if there is no current item.
*/
public function next();
};
/**
* Realization of ICharsetWeightMapIterator used in CharsetWeightMap class.
* @ingroup CharsetDetector
* @attention This class was @b not @b tested.
*/
class CharsetWeightMapIterator implements ICharsetWeightMapIterator {
    private $vector = array();
    private $index = 0;

    /**
     * @copydoc ICharsetWeightMapIterator::next
     * @attention This class was @b not @b tested.
     */
    public function next() {
        if ($this->index < count($this->vector)) {
            $current = $this->vector[$this->index];
            $this->index++;
            return $current;
        }
        // Past the last item
        return null;
    }

    /**
     * Append an item to the iterator. Usually called by the creator.
     * @param charsetWeightMapItem Item to add.
     */
    public function add($charsetWeightMapItem) {
        $this->vector[] = $charsetWeightMapItem;
    }
}
/**
* Class containing data of charset weight map item.
* @ingroup CharsetDetector
* Objects of this class are immutable. They just hold data.
*/
class CharsetWeightMapItem {
    private $charset;
    private $absWeight;
    private $relWeight;

    /**
     * Store the three values; they are only readable through the getters afterwards.
     * @param charset Charset name.
     * @param absWeight Absolute weight of charset.
     * @param relWeight Relative weight of charset.\ Using interval <-1, 1>.
     */
    public function __construct($charset, $absWeight, $relWeight) {
        $this->relWeight = $relWeight;
        $this->absWeight = $absWeight;
        $this->charset = $charset;
    }

    /**
     * @return Return charset name.
     */
    public function getCharset() {
        return $this->charset;
    }

    /**
     * @return Return absolute weight of charset.
     */
    public function getAbsWeight() {
        return $this->absWeight;
    }

    /**
     * @return Return relative weight of charset.\ Using interval <-1, 1>.
     */
    public function getRelWeight() {
        return $this->relWeight;
    }
}
/**
* Factory for CharsetStreamAnalyzer objects.
* @ingroup CharsetDetector
*/
class CharsetStreamAnalyzerFactory {
    /**
     * @return Return non-configured CharsetStreamAnalyzer object.
     * @note You need to load configuration yourself.
     */
    public static function createEmpty() {
        return new CharsetStreamAnalyzer;
    }

    /**
     * @return Return CharsetStreamAnalyzer object with default
     * configuration loaded (weight maps for ISO-8859-2 and CP1250).
     * @note You can extend its configuration yourself.
     * @note Consider this method's code while creating new extensions.
     */
    public static function createDefault() {
        $analyzer = self::createEmpty();

        // ISO-8859-2: the whole 0x80-0x9F range is scored as invalid.
        $iso2Invalid = array();
        for ($i = 0x80; $i <= 0x9F; $i++)
            $iso2Invalid[] = $i;
        $iso2Positive = array(
            0xA9, 0xAB, 0xAE,
            0xB9, 0xBB, 0xBE,
            0xC1, 0xC8, 0xC9,
            0xCC, 0xCD, 0xCF,
            0xD2, 0xD3, 0xD8,
            0xD9, 0xDA, 0xDD,
            0xE1, 0xE8, 0xE9,
            0xEC, 0xED, 0xEF,
            0xF2, 0xF3, 0xF8,
            0xF9, 0xFA, 0xFD,
        );
        $analyzer->addWeightMap("ISO-8859-2", self::buildWeightMap($iso2Invalid, $iso2Positive));

        // Windows 1250
        $cp1250Invalid = array(0x81, 0x83, 0x88, 0x90, 0x98);
        $cp1250Positive = array(
            0x8A, 0x8D, 0x8E,
            0x9A, 0x9D, 0x9E,
            0xC1, 0xC8, 0xC9,
            0xCC, 0xCD, 0xCF,
            0xD2, 0xD3, 0xD8,
            0xD9, 0xDA, 0xDD,
            0xE1, 0xE5, 0xE8,
            0xE9, 0xEC, 0xED,
            0xEF, 0xF2, 0xF3,
            // 0xF9 (lowercase u with ring) was missing here although it is scored
            // positively in the ISO-8859-2 map and its uppercase form 0xD9 is positive
            // in both maps; without it every such byte pushed detection towards ISO-8859-2.
            0xF8, 0xF9, 0xFA,
            0xFD,
        );
        $analyzer->addWeightMap("CP1250", self::buildWeightMap($cp1250Invalid, $cp1250Positive));

        return $analyzer;
    }

    /**
     * Build a weight map covering bytes 0x80-0xFF.
     * Every byte starts at the default weight -1; invalid bytes are then set
     * to -5 and positively scored bytes to 1.
     * @param invalidBytes List of byte values treated as invalid in the charset.
     * @param positiveBytes List of byte values that speak in favour of the charset.
     * @return Return array in format @e byte @e value @e => @e weight.
     */
    private static function buildWeightMap($invalidBytes, $positiveBytes) {
        $charWeightMap = array();
        for ($i = 0x80; $i <= 0xFF; $i++)
            // Default weight
            $charWeightMap[$i] = -1;
        foreach ($invalidBytes as $byte)
            $charWeightMap[$byte] = -5;
        foreach ($positiveBytes as $byte)
            $charWeightMap[$byte] = 1;
        return $charWeightMap;
    }
}
?>