Changeset 5224
- Timestamp:
- 10/04/06 09:03:57 (2 years ago)
- Files:
-
- trunk/activesupport/CHANGELOG (modified) (1 diff)
- trunk/activesupport/lib/active_support/multibyte/generators/generate_tables.rb (modified) (4 diffs)
- trunk/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb (modified) (4 diffs)
- trunk/activesupport/lib/active_support/values/unicode_tables-1.8.4.dat (deleted)
- trunk/activesupport/lib/active_support/values/unicode_tables-1.8.5.dat (deleted)
- trunk/activesupport/lib/active_support/values/unicode_tables.dat (added)
- trunk/activesupport/test/multibyte_chars_test.rb (modified) (1 diff)
- trunk/activesupport/test/multibyte_handler_test.rb (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/activesupport/CHANGELOG
r5223 r5224 1 1 *SVN* 2 2 3 * Add ActiveSupport::Multibyte. Provides String#chars which lets you deal with strings as a sequence of chars, not of bytes. Closes #6242 [Julian Tarkhanov, Manfred Stienstra & Jan Behrens] 3 * Pull in latest multibyte patch. Closes #6346 [Manfred Stienstra] 4 5 * Add ActiveSupport::Multibyte. Provides String#chars which lets you deal with strings as a sequence of chars, not of bytes. Closes #6242 [Julian Tarkhanov, Manfred Stienstra, Thijs van der Vossen & Jan Behrens] 4 6 5 7 * Fix issue with #class_inheritable_accessor saving updates to the parent class when initialized with an Array or Hash [mojombo] trunk/activesupport/lib/active_support/multibyte/generators/generate_tables.rb
r5223 r5224 19 19 :codepoints => BASE_URI + 'UnicodeData.txt', 20 20 :composition_exclusion => BASE_URI + 'CompositionExclusions.txt', 21 :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt' 21 :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt', 22 :cp1252 => 'http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT' 22 23 } 23 24 … … 34 35 @ucd.composition_map = {} 35 36 @ucd.boundary = {} 37 @ucd.cp1252 = {} 36 38 end 37 39 … … 88 90 end 89 91 92 def parse_cp1252(line) 93 if line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i 94 @ucd.cp1252[$1.hex] = $2.hex 95 end 96 end 97 90 98 def create_composition_map 91 99 @ucd.codepoints.each do |_, cp| … … 126 134 def dump_to(filename) 127 135 File.open(filename, 'wb') do |f| 128 f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary ]) 136 f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary, @ucd.cp1252]) 129 137 end 130 138 end trunk/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb
r5223 r5224 8 8 9 9 class UnicodeDatabase #:nodoc: 10 attr_accessor :codepoints, :composition_exclusion, :composition_map, :boundary 10 attr_accessor :codepoints, :composition_exclusion, :composition_map, :boundary, :cp1252 11 11 12 12 # Creates a new UnicodeDatabase instance and loads the database. 13 13 def initialize 14 14 begin 15 @codepoints, @composition_exclusion, @composition_map, @boundary = self.class.load 15 @codepoints, @composition_exclusion, @composition_map, @boundary, @cp1252 = self.class.load 16 16 rescue Exception => e 17 17 raise IOError.new("Couldn't load the unicode tables for UTF8Handler (#{e.message}), handler is unusable") … … 21 21 @composition_map ||= {} 22 22 @boundary ||= {} 23 @cp1252 ||= {} 23 24 24 25 # Redefine the === method so we can write shorter rules for grapheme cluster breaks … … 42 43 # Returns the filename for the data file for this version 43 44 def self.filename 44 File.expand_path File.join(dirname, "unicode_tables-#{VERSION}.dat") 45 File.expand_path File.join(dirname, "unicode_tables.dat") 45 46 end 46 47 47 48 # Loads the unicode database and returns all the internal objects of UnicodeDatabase 48 49 def self.load 49 begin 50 return load_file(filename) 51 rescue Exception 52 # If we can't load our own version, try the rest 53 Dir["#{dirname}/*.dat"].sort.each do |dat| 54 begin 55 return load_file(dat) 56 rescue Exception 57 end 58 end 59 end 60 raise IOError.new("Can't load a marshal file for your version of Ruby") 61 end 62 63 def self.load_file(filename) 64 50 File.open(self.filename, 'rb') { |f| Marshal.load f.read } 65 51 end … … 276 262 # Strips all the non-utf-8 bytes from the string resulting in a valid utf-8 string 277 263 def tidy_bytes(str) 278 str.split(//u).reject { |c| !UTF8_PAT.match(c) }.join 264 str.unpack('C*').map { |n| 265 n < 128 ? n.chr : 266 n < 160 ? [UCD.cp1252[n] || n].pack('U') : 267 n < 192 ? "\xC2" + n.chr : "\xC3" + (n-64).chr 268 }.join 279 269 end 280 270 trunk/activesupport/test/multibyte_chars_test.rb
r5223 r5224 140 140 def test_resilience 141 141 assert_nothing_raised do 142 assert_equal 1, @s[:bytes].chars.size, "There's only one valid utf-8 byte in the string" 142 assert_equal 5, @s[:bytes].chars.size, "The sequence contains five interpretable bytes" 143 143 end 144 reversed = [0xb8, 0x17e, 0x8, 0x2c6, 0xa5].reverse.pack('U*') 144 145 assert_nothing_raised do 145 assert_equal "\010", @s[:bytes].chars.reverse, "There's only one valid utf-8 byte in the string" 146 assert_equal reversed, @s[:bytes].chars.reverse.to_s, "Reversing the string should only yield interpretable bytes" 146 147 end 147 148 assert_nothing_raised do 148 149 @s[:bytes].chars.reverse! 149 assert_equal "\010", @s[:bytes], "There's only one valid utf-8 byte in the string" 150 assert_equal reversed, @s[:bytes].to_s, "Reversing the string should only yield interpretable bytes" 150 151 end 151 152 end trunk/activesupport/test/multibyte_handler_test.rb
r5223 r5224 225 225 226 226 def test_tidy_bytes 227 assert_equal "\010", @handler.tidy_bytes(@bytestring) 228 assert_equal "a\010a", @handler.tidy_bytes('a' + @bytestring + 'a') 227 result = [0xb8, 0x17e, 0x8, 0x2c6, 0xa5].pack('U*') 228 assert_equal result, @handler.tidy_bytes(@bytestring) 229 assert_equal "a#{result}a", @handler.tidy_bytes('a' + @bytestring + 'a') 229 230 assert_nothing_raised { @handler.tidy_bytes(@bytestring).unpack('U*') } 231 232 assert_equal "\xC3\xA7", @handler.tidy_bytes("\xE7") # iso_8859_1: small c cedilla 233 assert_equal "\xC2\xA9", @handler.tidy_bytes("\xA9") # iso_8859_1: copyright symbol 234 assert_equal "\xE2\x80\x9C", @handler.tidy_bytes("\x93") # win_1252: left smart quote 235 assert_equal "\xE2\x82\xAC", @handler.tidy_bytes("\x80") # win_1252: euro 236 assert_equal "\x00", @handler.tidy_bytes("\x00") # null char 237 assert_equal [0xef, 0xbf, 0xbd].pack('U*'), @handler.tidy_bytes("\xef\xbf\xbd") # invalid char 230 238 end 231 239