00001 <?php
/**
 * Detects the charset of a byte string and converts it to a target charset.
 *
 * Detection is delegated to a charset stream analyzer: every byte >= 0x80 of
 * the analyzed text is fed to it, and the best-scoring charset it reports is
 * used as the source encoding of the conversion, provided its relative weight
 * reaches the configured minimum relevance.
 */
class CharsetDetector {
    const DEFAULT_TARGET_CHARSET = "UTF-8";
    const DEFAULT_MAX_ANALYZED_LENGTH = 65536;
    const DEFAULT_MIN_RELEVANCE = 0.1;

    /** Analyzer that accumulates per-charset weights. */
    private $analyzer;
    /** Minimal relative weight the best charset must reach to trigger a conversion. */
    private $minRelevance;
    /** Charset the text is converted to. */
    private $targetCharset;

    /**
     * Reads a file and converts its content to the default target charset.
     *
     * @param string $filename Path of the file to read.
     * @return string|false Converted content, or false when the file cannot be read.
     */
    public static function file_get_contents($filename) {
        $contents = file_get_contents($filename);
        if (false === $contents) {
            // Propagate the read failure explicitly instead of analysing a non-string.
            return false;
        }
        return self::convert($contents);
    }

    /**
     * Detects the charset of a string with the default analyzer and converts it.
     *
     * @param string $stringToConvert Text in an unknown charset.
     * @param string $targetCharset   Charset to convert to.
     * @return mixed Result of convertIfRelevant() for the given text.
     */
    public static function convert($stringToConvert, $targetCharset = self::DEFAULT_TARGET_CHARSET) {
        $instance = new self;
        $instance->setTargetCharset($targetCharset);
        $instance->analyze($stringToConvert);
        return $instance->convertIfRelevant($stringToConvert);
    }

    /**
     * @param object|null $charsetStreamAnalyzer Analyzer to use; when null, the
     *        default analyzer from CharsetStreamAnalyzerFactory is created.
     */
    public function __construct($charsetStreamAnalyzer = null) {
        // An explicitly supplied analyzer must be kept; only fall back to the
        // default one when the caller passed nothing.
        if (null === $charsetStreamAnalyzer) {
            $this->analyzer = CharsetStreamAnalyzerFactory::createDefault();
        } else {
            $this->analyzer = $charsetStreamAnalyzer;
        }
        $this->minRelevance = self::DEFAULT_MIN_RELEVANCE;
        $this->targetCharset = self::DEFAULT_TARGET_CHARSET;
    }

    /**
     * @return object The analyzer used by this detector.
     */
    public function getCharsetStreamAnalyzer() {
        return $this->analyzer;
    }

    /**
     * Feeds the non-ASCII bytes of a string to the analyzer.
     *
     * @param string $stringToAnalyze   Text to analyze.
     * @param int    $maxLengthAnalyzed Upper bound on the number of leading bytes inspected.
     */
    public function analyze($stringToAnalyze, $maxLengthAnalyzed = self::DEFAULT_MAX_ANALYZED_LENGTH) {
        $length = strlen($stringToAnalyze);
        if ($length > $maxLengthAnalyzed) {
            // Only the leading part of long inputs is inspected.
            $length = $maxLengthAnalyzed;
        }

        for ($i = 0; $i < $length; $i++) {
            $char = $stringToAnalyze[$i];
            // Bytes below 0x80 are skipped; only the upper half is passed on.
            if (0x80 <= ord($char)) {
                $this->analyzer->putchar($char);
            }
        }
    }

    /**
     * Converts the string from the best-scoring charset to the target charset.
     *
     * The string is returned unchanged when the analyzer reports no charset or
     * when the best charset's relative weight is below the minimal relevance.
     *
     * @param string $stringToConvert Text to convert.
     * @return mixed The converted text, or the input unchanged.
     */
    public function convertIfRelevant($stringToConvert) {
        $weightMap = $this->analyzer->getCharsetWeightMap();
        $iterator = $weightMap->createSortedIterator();
        $item = $iterator->next();
        if (null === $item) {
            return $stringToConvert;
        }

        $relWeight = $item->getRelWeight();
        $charset = $item->getCharset();
        if ($relWeight < $this->minRelevance) {
            return $stringToConvert;
        }
        return CharsetConverter::convertCharset($charset, $this->targetCharset, $stringToConvert);
    }

    /**
     * @return string Charset the text is converted to.
     */
    public function getTargetCharset() {
        return $this->targetCharset;
    }

    /**
     * @param string $targetCharset Charset the text is converted to.
     */
    public function setTargetCharset($targetCharset) {
        $this->targetCharset = $targetCharset;
    }

    /**
     * @return float Minimal relative weight required for a conversion.
     */
    public function getMinRelevance() {
        return $this->minRelevance;
    }

    /**
     * @param float $minRelevance Minimal relative weight required for a conversion.
     */
    public function setMinRelevance($minRelevance) {
        $this->minRelevance = $minRelevance;
    }
}
00144
/**
 * Thin wrapper around the charset conversion routine.
 */
class CharsetConverter
{
    /**
     * Converts a string from one encoding to another using iconv().
     *
     * @param string $sourceEncoding  Encoding the string is currently in.
     * @param string $targetEncoding  Encoding to convert the string to.
     * @param string $stringToConvert Text to convert.
     * @return string|false Whatever iconv() returns for these arguments.
     */
    public static function convertCharset($sourceEncoding, $targetEncoding, $stringToConvert)
    {
        $converted = iconv($sourceEncoding, $targetEncoding, $stringToConvert);

        return $converted;
    }
}
00159
/**
 * A sink that accepts input one character at a time.
 */
interface IInputStream
{
    /**
     * Puts a single character into the stream.
     *
     * @param string $charToPut Character to put.
     */
    public function putchar($charToPut);
}
00172
/**
 * A collection of per-charset character weight maps.
 */
interface ICharWeightMapList
{
    /**
     * Registers a character weight map under a charset name.
     *
     * @param string $charsetName   Name of the charset the map belongs to.
     * @param array  $charWeightMap Weight map for that charset.
     */
    public function addWeightMap($charsetName, $charWeightMap);
}
00185
/**
 * Accumulates per-charset weights for the characters put into it.
 *
 * Each registered charset has a map from character code to weight; every
 * character put into the analyzer adds the matching weight to that charset's
 * running total.
 */
class CharsetStreamAnalyzer implements IInputStream, ICharWeightMapList
{
    /** Registered weight maps, keyed by charset name. */
    private $weightMapList = array();
    /** Accumulated weights per charset. */
    private $charsetWeightMap;

    public function __construct()
    {
        $this->charsetWeightMap = new CharsetWeightMap();
    }

    /**
     * Scores one character against every registered weight map.
     *
     * @param string $charToPut Character to score; its ord() value is the lookup key.
     */
    public function putchar($charToPut)
    {
        $code = ord($charToPut);
        foreach ($this->weightMapList as $charsetName => $weights) {
            // Codes missing from a map contribute nothing to that charset.
            if (!array_key_exists($code, $weights)) {
                continue;
            }
            $this->charsetWeightMap->addWeight($charsetName, $weights[$code]);
        }
    }

    /**
     * Registers a weight map for a charset and adds a zero weight for it to
     * the accumulated weights.
     *
     * @param string $charsetName   Name of the charset.
     * @param array  $charWeightMap Map from character code to weight.
     */
    public function addWeightMap($charsetName, $charWeightMap)
    {
        $this->weightMapList[$charsetName] = $charWeightMap;
        $this->charsetWeightMap->addWeight($charsetName, 0);
    }

    /**
     * @return CharsetWeightMap A clone of the accumulated weights.
     */
    public function getCharsetWeightMap()
    {
        $snapshot = clone $this->charsetWeightMap;

        return $snapshot;
    }
}
00228
/**
 * Accumulated weights per charset, together with their overall sum.
 */
class CharsetWeightMap {
    /** Accumulated weight per charset name. */
    private $map = array();
    /** Sum of all weights added so far. */
    private $totalWeight = 0;

    /**
     * Adds a weight to a charset's accumulated weight and to the total.
     *
     * @param string    $charset Charset name.
     * @param int|float $weight  Weight to add; may be zero or negative.
     */
    public function addWeight($charset, $weight) {
        // Start new charsets at zero so that "+=" never reads an undefined key.
        if (!isset($this->map[$charset])) {
            $this->map[$charset] = 0;
        }
        $this->map[$charset] += $weight;
        $this->totalWeight += $weight;
    }

    /**
     * Creates an iterator over the charsets, ordered by descending weight.
     *
     * Each item carries the charset name, its absolute weight, and its weight
     * divided by the total weight. The iterator is empty unless the total
     * weight is positive.
     *
     * @return CharsetWeightMapIterator
     */
    public function createSortedIterator() {
        $iterator = new CharsetWeightMapIterator;
        if ($this->totalWeight > 0) {
            $sortedMap = $this->map;
            // arsort() keeps the charset names as keys while sorting by weight.
            arsort($sortedMap);
            foreach ($sortedMap as $charset => $weight) {
                $item = new CharsetWeightMapItem($charset, $weight, $weight / $this->totalWeight);
                $iterator->add($item);
            }
        }
        return $iterator;
    }
}
00265
/**
 * Forward-only iterator over charset weight map items.
 */
interface ICharsetWeightMapIterator
{
    /**
     * Advances the iterator.
     *
     * @return mixed The next item as defined by the implementation.
     */
    public function next();
}
00278
/**
 * Iterator over an in-memory list of charset weight map items.
 */
class CharsetWeightMapIterator implements ICharsetWeightMapIterator
{
    /** Items in the order they were added. */
    private $vector = array();
    /** Position of the item returned by the next call to next(). */
    private $index = 0;

    /**
     * Returns the item at the current position and moves one step forward.
     *
     * @return mixed The next item, or null once all items have been returned.
     */
    public function next()
    {
        if ($this->index < count($this->vector)) {
            $current = $this->vector[$this->index];
            $this->index++;

            return $current;
        }

        return null;
    }

    /**
     * Appends an item to the end of the list.
     *
     * @param mixed $charsetWeightMapItem Item to append.
     */
    public function add($charsetWeightMapItem)
    {
        $this->vector[] = $charsetWeightMapItem;
    }
}
00305
/**
 * Value holder for one charset and its absolute and relative weight.
 */
class CharsetWeightMapItem
{
    /** Charset name. */
    private $charset;
    /** Absolute weight of the charset. */
    private $absWeight;
    /** Relative weight of the charset. */
    private $relWeight;

    /**
     * @param string    $charset   Charset name.
     * @param int|float $absWeight Absolute weight.
     * @param int|float $relWeight Relative weight.
     */
    public function __construct($charset, $absWeight, $relWeight)
    {
        $this->relWeight = $relWeight;
        $this->absWeight = $absWeight;
        $this->charset = $charset;
    }

    /**
     * @return string The charset name given to the constructor.
     */
    public function getCharset()
    {
        return $this->charset;
    }

    /**
     * @return int|float The absolute weight given to the constructor.
     */
    public function getAbsWeight()
    {
        return $this->absWeight;
    }

    /**
     * @return int|float The relative weight given to the constructor.
     */
    public function getRelWeight()
    {
        return $this->relWeight;
    }
}
00344
/**
 * Factory for charset stream analyzers.
 */
class CharsetStreamAnalyzerFactory {
    /**
     * @return CharsetStreamAnalyzer An analyzer with no weight maps registered.
     */
    public static function createEmpty() {
        return new CharsetStreamAnalyzer;
    }

    /**
     * Creates an analyzer that tells ISO-8859-2 and CP1250 apart.
     *
     * Every byte 0x80-0xFF gets a weight per charset: +1 for bytes that are
     * expected accented letters in that charset, -1 for other bytes, and -5
     * for bytes that should not occur in text of that charset at all.
     *
     * @return CharsetStreamAnalyzer
     */
    public static function createDefault() {
        $analyzer = self::createEmpty();

        // Accented letters in 0xC0-0xFF. Both code pages encode them at the
        // same positions, so they must score identically for both charsets;
        // the lower-case row mirrors the upper-case row at +0x20.
        $sharedLetters = array(
            0xC1, 0xC8, 0xC9, 0xCC, 0xCD, 0xCF, 0xD2, 0xD3, 0xD8, 0xD9, 0xDA, 0xDD,
            0xE1, 0xE8, 0xE9, 0xEC, 0xED, 0xEF, 0xF2, 0xF3, 0xF8, 0xF9, 0xFA, 0xFD,
        );

        // ISO-8859-2: 0x80-0x9F is the C1 control range, so it is penalised
        // heavily; everything else in the upper half is mildly negative.
        $charWeightMap = array_fill(0x80, 0x20, -5) + array_fill(0xA0, 0x60, -1);
        // Caron letters that sit at 0xA9-0xBE only in ISO-8859-2.
        $charWeightMap = self::withWeight($charWeightMap, array(0xA9, 0xAB, 0xAE, 0xB9, 0xBB, 0xBE), 1);
        $charWeightMap = self::withWeight($charWeightMap, $sharedLetters, 1);
        $analyzer->addWeightMap("ISO-8859-2", $charWeightMap);

        // CP1250: the whole upper half is mildly negative by default.
        $charWeightMap = array_fill(0x80, 0x80, -1);
        // Code points that are unassigned in CP1250.
        $charWeightMap = self::withWeight($charWeightMap, array(0x81, 0x83, 0x88, 0x90, 0x98), -5);
        // The same caron letters live at 0x8A-0x9E in CP1250.
        $charWeightMap = self::withWeight($charWeightMap, array(0x8A, 0x8D, 0x8E, 0x9A, 0x9D, 0x9E), 1);
        // 0xF9 used to be missing here although 0xD9 was present, which
        // penalised CP1250 for a letter both charsets encode identically.
        $charWeightMap = self::withWeight($charWeightMap, $sharedLetters, 1);
        // NOTE(review): 0xE5 has always been scored for CP1250 only; kept
        // as-is to preserve existing detection results - confirm the intent.
        $charWeightMap = self::withWeight($charWeightMap, array(0xE5), 1);
        $analyzer->addWeightMap("CP1250", $charWeightMap);

        return $analyzer;
    }

    /**
     * Returns a copy of the weight map with the given codes set to one weight.
     */
    private static function withWeight($charWeightMap, $codes, $weight) {
        foreach ($codes as $code) {
            $charWeightMap[$code] = $weight;
        }
        return $charWeightMap;
    }
}
00411
00412 ?>