Ruby on Rails | Screencasts | Download | Documentation | Weblog | Community | Source

Changeset 5224

Show
Ignore:
Timestamp:
10/04/06 09:03:57 (2 years ago)
Author:
nzkoz
Message:

Pull in latest multibyte patch. Closes #6346 [Manfred Stienstra]

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/activesupport/CHANGELOG

    r5223 r5224  
    11*SVN* 
    22 
    3 * Add ActiveSupport::Multibyte.  Provides String#chars which lets you deal with strings as a sequence of chars, not of bytes. Closes #6242 [Julian Tarkhanov, Manfred Stienstra & Jan Behrens] 
     3* Pull in latest multibyte patch. Closes #6346 [Manfred Stienstra] 
     4 
     5* Add ActiveSupport::Multibyte.  Provides String#chars which lets you deal with strings as a sequence of chars, not of bytes. Closes #6242 [Julian Tarkhanov, Manfred Stienstra, Thijs van der Vossen & Jan Behrens] 
    46 
    57* Fix issue with #class_inheritable_accessor saving updates to the parent class when initialized with an Array or Hash [mojombo] 
  • trunk/activesupport/lib/active_support/multibyte/generators/generate_tables.rb

    r5223 r5224  
    1919      :codepoints => BASE_URI + 'UnicodeData.txt', 
    2020      :composition_exclusion => BASE_URI + 'CompositionExclusions.txt', 
    21       :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt' 
     21      :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt', 
     22      :cp1252 => 'http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT' 
    2223    } 
    2324     
     
    3435      @ucd.composition_map = {} 
    3536      @ucd.boundary = {} 
     37      @ucd.cp1252 = {} 
    3638    end 
    3739     
     
    8890    end 
    8991     
     92    def parse_cp1252(line) 
     93      if line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i 
     94        @ucd.cp1252[$1.hex] = $2.hex 
     95      end 
     96    end 
     97     
    9098    def create_composition_map 
    9199      @ucd.codepoints.each do |_, cp| 
     
    126134    def dump_to(filename) 
    127135      File.open(filename, 'wb') do |f| 
    128         f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary]) 
     136        f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary, @ucd.cp1252]) 
    129137      end 
    130138    end 
  • trunk/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb

    r5223 r5224  
    88   
    99  class UnicodeDatabase #:nodoc: 
    10     attr_accessor :codepoints, :composition_exclusion, :composition_map, :boundary 
     10    attr_accessor :codepoints, :composition_exclusion, :composition_map, :boundary, :cp1252 
    1111     
    1212    # Creates a new UnicodeDatabase instance and loads the database. 
    1313    def initialize 
    1414      begin 
    15         @codepoints, @composition_exclusion, @composition_map, @boundary = self.class.load 
     15        @codepoints, @composition_exclusion, @composition_map, @boundary, @cp1252 = self.class.load 
    1616      rescue Exception => e 
    1717          raise IOError.new("Couldn't load the unicode tables for UTF8Handler (#{e.message}), handler is unusable") 
     
    2121      @composition_map ||= {} 
    2222      @boundary ||= {} 
     23      @cp1252 ||= {} 
    2324       
    2425      # Redefine the === method so we can write shorter rules for grapheme cluster breaks 
     
    4243    # Returns the filename for the data file for this version 
    4344    def self.filename 
    44       File.expand_path File.join(dirname, "unicode_tables-#{VERSION}.dat") 
     45      File.expand_path File.join(dirname, "unicode_tables.dat") 
    4546    end 
    4647     
    4748    # Loads the unicode database and returns all the internal objects of UnicodeDatabase 
    4849    def self.load 
    49       begin 
    50         return load_file(filename) 
    51       rescue Exception 
    52         # If we can't load our own version, try the rest 
    53         Dir["#{dirname}/*.dat"].sort.each do |dat| 
    54           begin 
    55             return load_file(dat) 
    56           rescue Exception 
    57           end 
    58         end 
    59       end 
    60       raise IOError.new("Can't load a marshal file for your version of Ruby") 
    61     end 
    62      
    63     def self.load_file(filename) 
    6450      File.open(self.filename, 'rb') { |f| Marshal.load f.read } 
    6551    end 
     
    276262      # Strips all the non-utf-8 bytes from the string resulting in a valid utf-8 string 
    277263      def tidy_bytes(str) 
    278         str.split(//u).reject { |c| !UTF8_PAT.match(c) }.join 
     264        str.unpack('C*').map { |n| 
     265          n < 128 ? n.chr : 
     266          n < 160 ? [UCD.cp1252[n] || n].pack('U') : 
     267          n < 192 ? "\xC2" + n.chr : "\xC3" + (n-64).chr 
     268        }.join 
    279269      end 
    280270       
  • trunk/activesupport/test/multibyte_chars_test.rb

    r5223 r5224  
    140140  def test_resilience 
    141141    assert_nothing_raised do 
    142       assert_equal 1, @s[:bytes].chars.size, "There's only one valid utf-8 byte in the string"
     142      assert_equal 5, @s[:bytes].chars.size, "The sequence contains five interpretable bytes"
    143143    end 
     144    reversed = [0xb8, 0x17e, 0x8, 0x2c6, 0xa5].reverse.pack('U*') 
    144145    assert_nothing_raised do 
    145       assert_equal "\010", @s[:bytes].chars.reverse, "There's only one valid utf-8 byte in the string"
     146      assert_equal reversed, @s[:bytes].chars.reverse.to_s, "Reversing the string should only yield interpretable bytes"
    146147    end 
    147148    assert_nothing_raised do 
    148149      @s[:bytes].chars.reverse! 
    149       assert_equal "\010", @s[:bytes], "There's only one valid utf-8 byte in the string"
     150      assert_equal reversed, @s[:bytes].to_s, "Reversing the string should only yield interpretable bytes"
    150151    end 
    151152  end 
  • trunk/activesupport/test/multibyte_handler_test.rb

    r5223 r5224  
    225225   
    226226  def test_tidy_bytes 
    227     assert_equal "\010", @handler.tidy_bytes(@bytestring) 
    228     assert_equal "a\010a", @handler.tidy_bytes('a' + @bytestring + 'a') 
     227    result = [0xb8, 0x17e, 0x8, 0x2c6, 0xa5].pack('U*') 
     228    assert_equal result, @handler.tidy_bytes(@bytestring) 
     229    assert_equal "a#{result}a", @handler.tidy_bytes('a' + @bytestring + 'a') 
    229230    assert_nothing_raised { @handler.tidy_bytes(@bytestring).unpack('U*') } 
     231     
     232    assert_equal "\xC3\xA7", @handler.tidy_bytes("\xE7") # iso_8859_1: small c cedilla 
     233    assert_equal "\xC2\xA9", @handler.tidy_bytes("\xA9") # iso_8859_1: copyright symbol 
     234    assert_equal "\xE2\x80\x9C", @handler.tidy_bytes("\x93") # win_1252: left smart quote 
     235    assert_equal "\xE2\x82\xAC", @handler.tidy_bytes("\x80") # win_1252: euro 
     236    assert_equal "\x00", @handler.tidy_bytes("\x00") # null char 
     237    assert_equal [0xef, 0xbf, 0xbd].pack('U*'), @handler.tidy_bytes("\xef\xbf\xbd") # invalid char 
    230238  end 
    231239