lib/CharsetDetector.class.php

Go to the documentation of this file.
00001 <?php
/**
 * Guesses the source charset of a byte string from its non-ASCII bytes and
 * converts the string to a target charset when the guess is relevant enough.
 *
 * The statistical work is delegated to a charset stream analyzer; this class
 * only feeds it bytes and acts on the best-ranked charset it reports.
 */
class CharsetDetector {
  // Charset used as conversion target unless setTargetCharset() is called.
  const DEFAULT_TARGET_CHARSET = "UTF-8";
  // Default upper bound (in bytes) on how much of the input analyze() inspects.
  const DEFAULT_MAX_ANALYZED_LENGTH = 65536;
  // Default minimal relative weight the best charset must reach to be used.
  const DEFAULT_MIN_RELEVANCE = 0.1;

  private $analyzer;
  private $minRelevance;
  private $targetCharset;

  /**
   * Reads a file and returns its content run through convert().
   *
   * @param string $filename Path handed to the global file_get_contents().
   * @return mixed Result of convert(), or false when the file cannot be read.
   */
  public static function file_get_contents($filename) {
    $contents = file_get_contents($filename);
    if (false === $contents)
      // Nothing to analyze; report the failure the same way the native call does.
      return false;
    return self::convert($contents);
  }

  /**
   * One-shot helper: analyzes the string with a default detector and converts
   * it to the requested charset if the detection is relevant.
   *
   * @param string $stringToConvert Bytes in an unknown charset.
   * @param string $targetCharset   Charset to convert to.
   * @return mixed Result of convertIfRelevant().
   */
  public static function convert($stringToConvert, $targetCharset = self::DEFAULT_TARGET_CHARSET) {
    $instance = new self;
    $instance->setTargetCharset($targetCharset);
    $instance->analyze($stringToConvert);
    return $instance->convertIfRelevant($stringToConvert);
  }

  /**
   * @param object|null $charsetStreamAnalyzer Analyzer to use; when omitted,
   *        the factory's default analyzer is created.
   */
  public function __construct($charsetStreamAnalyzer = null) {
    if (null == $charsetStreamAnalyzer)
      $charsetStreamAnalyzer = CharsetStreamAnalyzerFactory::createDefault();
    // Always store the analyzer: previously an injected analyzer was dropped
    // and every later call dereferenced an unset property.
    $this->analyzer = $charsetStreamAnalyzer;
    $this->minRelevance = self::DEFAULT_MIN_RELEVANCE;
    $this->targetCharset = self::DEFAULT_TARGET_CHARSET;
  }

  /**
   * @return object The analyzer this detector feeds.
   */
  public function getCharsetStreamAnalyzer() {
    return $this->analyzer;
  }

  /**
   * Feeds every non-ASCII byte (>= 0x80) of the string to the analyzer.
   *
   * @param string $stringToAnalyze   Bytes to inspect.
   * @param int    $maxLengthAnalyzed Only this many leading bytes are inspected.
   */
  public function analyze($stringToAnalyze, $maxLengthAnalyzed = self::DEFAULT_MAX_ANALYZED_LENGTH) {
    // Limit analyzed length
    $length = min(strlen($stringToAnalyze), $maxLengthAnalyzed);

    for ($i = 0; $i < $length; $i++) {
      $char = $stringToAnalyze[$i];
      if (ord($char) >= 0x80)
        // Non-ASCII character
        $this->analyzer->putchar($char);
    }
  }

  /**
   * Converts the string from the best-ranked charset to the target charset.
   *
   * Only the first item of the analyzer's sorted iterator is considered. The
   * string is returned untouched when there is no such item or when its
   * relative weight is below the configured minimal relevance.
   *
   * @param string $stringToConvert Bytes to convert.
   * @return mixed The original string, or the result of
   *         CharsetConverter::convertCharset().
   */
  public function convertIfRelevant($stringToConvert) {
    $weightMap = $this->analyzer->getCharsetWeightMap();
    $best = $weightMap->createSortedIterator()->next();
    if (null === $best)
      return $stringToConvert;
    if ($best->getRelWeight() < $this->minRelevance)
      return $stringToConvert;
    return CharsetConverter::convertCharset($best->getCharset(), $this->targetCharset, $stringToConvert);
  }

  /**
   * @return string Charset used as conversion target.
   */
  public function getTargetCharset() {
    return $this->targetCharset;
  }

  /**
   * @param string $targetCharset Charset used as conversion target.
   */
  public function setTargetCharset($targetCharset) {
    $this->targetCharset = $targetCharset;
  }

  /**
   * @return float Minimal relative weight required for a conversion.
   */
  public function getMinRelevance() {
    return $this->minRelevance;
  }

  /**
   * @param float $minRelevance Minimal relative weight required for a conversion.
   */
  public function setMinRelevance($minRelevance) {
    $this->minRelevance = $minRelevance;
  }
}
00144 
/**
 * Thin facade over the iconv extension.
 */
class CharsetConverter {
  /**
   * Converts a string between two encodings using iconv().
   *
   * @param string $sourceEncoding  Encoding the string is currently in.
   * @param string $targetEncoding  Encoding to produce.
   * @param string $stringToConvert Bytes to convert.
   * @return mixed Whatever iconv() returns for these arguments.
   */
  public static function convertCharset($sourceEncoding, $targetEncoding, $stringToConvert) {
    $converted = iconv($sourceEncoding, $targetEncoding, $stringToConvert);

    return $converted;
  }
}
00159 
/**
 * Sink that accepts input one character at a time.
 */
interface IInputStream
{
  /**
   * Pushes a single character into the stream.
   *
   * @param string $charToPut Character to push.
   */
  public function putchar($charToPut);
}
00172 
/**
 * Collection of per-charset character weight maps.
 */
interface ICharWeightMapList
{
  /**
   * Registers a character weight map under a charset name.
   *
   * @param string $charsetName   Name of the charset the map belongs to.
   * @param array  $charWeightMap Weight map to register.
   */
  public function addWeightMap($charsetName, $charWeightMap);
}
00185 
/**
 * Accumulates, for every registered charset, the weights of the characters
 * pushed into it.
 */
class CharsetStreamAnalyzer implements IInputStream, ICharWeightMapList {
  // Registered weight maps, keyed by charset name.
  private $weightMapList = array();
  // Running per-charset totals.
  private $charsetWeightMap;

  public function __construct() {
    $this->charsetWeightMap = new CharsetWeightMap();
  }

  /**
   * Scores one character against every registered weight map.
   *
   * @param string $charToPut Character whose ord() value is looked up.
   */
  public function putchar($charToPut) {
    $code = ord($charToPut);
    foreach ($this->weightMapList as $charsetName => $weights) {
      // Codes a map does not mention contribute nothing to that charset.
      if (!array_key_exists($code, $weights)) {
        continue;
      }
      $this->charsetWeightMap->addWeight($charsetName, $weights[$code]);
      // Useful for debug
      // printf("%x - %d<br/>\n", $code, $weights[$code]);
    }
  }

  /**
   * Registers a weight map and makes the charset known to the totals with a
   * zero weight.
   *
   * @param string $charsetName   Name of the charset.
   * @param array  $charWeightMap Weights keyed by character code.
   */
  public function addWeightMap($charsetName, $charWeightMap) {
    $this->weightMapList[$charsetName] = $charWeightMap;
    $this->charsetWeightMap->addWeight($charsetName, 0);
  }

  /**
   * @return CharsetWeightMap A clone of the accumulated totals.
   */
  public function getCharsetWeightMap() {
    $snapshot = clone $this->charsetWeightMap;
    return $snapshot;
  }
}
00228 
/**
 * Per-charset weight totals together with their overall sum.
 */
class CharsetWeightMap {
  // Accumulated weight keyed by charset name.
  private $map = array();
  // Sum of every weight ever added, across all charsets.
  private $totalWeight = 0;

  /**
   * Adds a weight to a charset's total and to the overall sum.
   *
   * @param string    $charset Charset name.
   * @param int|float $weight  Weight to add (may be zero or negative).
   */
  public function addWeight($charset, $weight) {
    // Start unseen charsets at zero; "+=" on a missing key raises an
    // undefined index/key notice.
    if (!isset($this->map[$charset]))
      $this->map[$charset] = 0;
    $this->map[$charset] += $weight;
    $this->totalWeight += $weight;
  }

  /**
   * Builds an iterator over the charsets ordered by descending weight.
   *
   * Each item carries the charset, its absolute weight and its weight divided
   * by the overall sum. The iterator is left empty when the overall sum is not
   * positive.
   *
   * @return CharsetWeightMapIterator
   */
  public function createSortedIterator() {
    $iterator = new CharsetWeightMapIterator;
    if ($this->totalWeight > 0) {
      $sortedMap = $this->map;
      arsort($sortedMap);
      foreach ($sortedMap as $charset => $weight) {
        $item = new CharsetWeightMapItem($charset, $weight, $weight / $this->totalWeight);
        // Useful for debug
        // printf("%s - %.2f %%<br/>\n", $charset, $weight/$this->totalWeight*100);
        $iterator->add($item);
      }
    }
    return $iterator;
  }
}
00265 
/**
 * Forward-only cursor over charset weight items.
 */
interface ICharsetWeightMapIterator
{
  /**
   * Advances the cursor and hands out the next item.
   */
  public function next();
}
00278 
/**
 * Array-backed iterator that hands out items in the order they were added.
 */
class CharsetWeightMapIterator implements ICharsetWeightMapIterator {
  // Items in insertion order.
  private $vector = array();
  // Position of the item the next call to next() returns.
  private $index = 0;

  /**
   * @return mixed The item at the current position (the position then moves
   *         forward), or null once every item has been handed out.
   */
  public function next() {
    if ($this->index < count($this->vector)) {
      $current = $this->vector[$this->index];
      $this->index++;
      return $current;
    }
    return null;
  }

  /**
   * Appends an item to the end of the sequence.
   *
   * @param mixed $charsetWeightMapItem Item to append.
   */
  public function add($charsetWeightMapItem) {
    array_push($this->vector, $charsetWeightMapItem);
  }
}
00305 
/**
 * Read-only record pairing a charset with its absolute and relative weight.
 */
class CharsetWeightMapItem {
  private $charset;
  private $absWeight;
  private $relWeight;

  /**
   * @param string    $charset   Charset name.
   * @param int|float $absWeight Absolute weight of the charset.
   * @param int|float $relWeight Relative weight of the charset.
   */
  public function __construct($charset, $absWeight, $relWeight) {
    $this->relWeight = $relWeight;
    $this->absWeight = $absWeight;
    $this->charset = $charset;
  }

  /**
   * @return string Charset name given to the constructor.
   */
  public function getCharset() {
    return $this->charset;
  }

  /**
   * @return int|float Absolute weight given to the constructor.
   */
  public function getAbsWeight() {
    return $this->absWeight;
  }

  /**
   * @return int|float Relative weight given to the constructor.
   */
  public function getRelWeight() {
    return $this->relWeight;
  }
}
00344 
/**
 * Creates charset stream analyzers.
 */
class CharsetStreamAnalyzerFactory {
  /**
   * @return CharsetStreamAnalyzer Analyzer with no weight maps registered.
   */
  public static function createEmpty() {
    return new CharsetStreamAnalyzer;
  }

  /**
   * Creates an analyzer that knows the ISO-8859-2 and CP1250 weight maps.
   *
   * Both maps cover bytes 0x80-0xFF with a weight of -1 by default, -5 for
   * bytes treated as invalid in the charset and +1 for bytes treated as
   * expected letters.
   *
   * @return CharsetStreamAnalyzer
   */
  public static function createDefault() {
    $analyzer = self::createEmpty();

    // Letters that sit on the same byte in both charsets. They must be
    // rewarded identically: if only one map rewards such a byte, a text whose
    // only non-ASCII byte is that one scores +1/-1, the total is 0 and no
    // conversion happens. 0xF9 (lowercase u with ring) used to be missing
    // from the CP1250 map although its uppercase form 0xD9 was present.
    $sharedLetters = array(
      0xC1, 0xC8, 0xC9, 0xCC, 0xCD, 0xCF,
      0xD2, 0xD3, 0xD8, 0xD9, 0xDA, 0xDD,
      0xE1, 0xE8, 0xE9, 0xEC, 0xED, 0xEF,
      0xF2, 0xF3, 0xF8, 0xF9, 0xFA, 0xFD,
    );

    // ISO-8859-2: the whole 0x80-0x9F range is treated as invalid.
    $isoOnlyLetters = array(0xA9, 0xAB, 0xAE, 0xB9, 0xBB, 0xBE);
    $analyzer->addWeightMap("ISO-8859-2", self::buildWeightMap(
      range(0x80, 0x9F),
      array_merge($isoOnlyLetters, $sharedLetters)
    ));

    // Windows 1250
    // NOTE(review): 0xE5 is rewarded for CP1250 only, as in the original
    // table, although the byte appears to map to the same letter in
    // ISO-8859-2 -- confirm whether it belongs in the shared list.
    $cpOnlyLetters = array(0x8A, 0x8D, 0x8E, 0x9A, 0x9D, 0x9E, 0xE5);
    $analyzer->addWeightMap("CP1250", self::buildWeightMap(
      array(0x81, 0x83, 0x88, 0x90, 0x98),
      array_merge($cpOnlyLetters, $sharedLetters)
    ));

    return $analyzer;
  }

  /**
   * Builds a weight map for bytes 0x80-0xFF.
   *
   * @param array $invalidCodes  Byte values weighted -5.
   * @param array $positiveCodes Byte values weighted +1.
   * @return array Weights keyed by byte value, in ascending key order.
   */
  private static function buildWeightMap(array $invalidCodes, array $positiveCodes) {
    $map = array();
    for ($code = 0x80; $code <= 0xFF; $code++)
      // Default weight
      $map[$code] = -1;
    foreach ($invalidCodes as $code)
      $map[$code] = -5;
    foreach ($positiveCodes as $code)
      $map[$code] = 1;
    return $map;
  }
}
00411 
00412 ?>

Generated on Sat Mar 8 10:26:43 2008 for Dudka.cz by  doxygen 1.5.4