forked from len0rd/rockbox
		
	Please test :) it should convert dict format dictionarys to rockboxformat. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@6621 a1c6a512-1295-4272-9138-f99709370657
		
			
				
	
	
		
			83 lines
		
	
	
	
		
			2.9 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable file
		
	
	
	
	
			
		
		
	
	
			83 lines
		
	
	
	
		
			2.9 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable file
		
	
	
	
	
| #!/usr/bin/perl
 | |
| 
 | |
| #             __________               __   ___.
 | |
| #   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
 | |
| #   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
 | |
| #   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 | |
| #   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 | |
| #                     \/            \/     \/    \/            \/
 | |
| # $Id$
 | |
| #
 | |
| # Copyright (C) 2005 Tony Motakis
 | |
| #
 | |
| # All files in this archive are subject to the GNU General Public License.
 | |
| # See the file COPYING in the source tree root for full license agreement.
 | |
| #
 | |
| # This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 | |
| # KIND, either express or implied.
 | |
| 
 | |
# Maximum number of characters kept from each headword read from the index.
our $word_limit = 32;
 | |
| 
 | |
| use Compress::Zlib;
 | |
| 
 | |
# Base-64 digit alphabet, in value order: 'A'-'Z' are 0..25, 'a'-'z' are
# 26..51, '0'-'9' are 52..61, then '+' (62) and '/' (63).
our @b64_values = ( 'A' .. 'Z', 'a' .. 'z', 0 .. 9, '+', '/' );

# Reverse lookup used by base64(): digit character => numeric value.
our %b64_get_value;
@b64_get_value{@b64_values} = ( 0 .. $#b64_values );
 | |
| 
 | |
# Decode a number written in base-64 positional notation (most significant
# digit first), using the digit values stored in %b64_get_value.
#
# Argument: the encoded string. Whitespace characters are ignored.
# Returns:  the decoded integer; 0 for an undefined or empty string.
#
# A character that is not in %b64_get_value counts as a digit of value 0.
sub base64 {
	my ($encoded) = @_;
	our %b64_get_value;	# digit => value table, filled in at file scope

	my $num = 0;
	return $num unless defined $encoded;

	# Iterate over the characters instead of testing the remaining string
	# for truth: the digit '0' (value 52) is a false value in Perl, so a
	# truth test would end the loop early on a lone or leading '0'.
	foreach my $digit (split //, $encoded) {
		next if $digit =~ /\s/;
		my $value = $b64_get_value{$digit};
		$value = 0 unless defined $value;
		$num = $num * 64 + $value;
	}

	return $num;
}
 | |
| 
 | |
# Open the files named on the command line:
#   $ARGV[0]  index file, read through the INDEX handle
#   $ARGV[1]  definitions file, read through zlib; $DICT is the gzopen
#             object, which is why it is a scalar and not a filehandle
#   $ARGV[2]  output file, written through the RDFOUT handle
@ARGV >= 3
	or die "Usage: $0 <index file> <definitions file> <output file>\n";

# Three-argument open: the mode is given separately, so characters such as
# '>' or '|' in a file name cannot change how the file is opened. The
# bareword handle names are kept because the rest of the script uses them.
open INDEX, '<', $ARGV[0] or die "Could not open index: $!";
$DICT = gzopen($ARGV[1], "rb") or die "Could not open definitions file: $!";
open RDFOUT, '>', $ARGV[2] or die "Could not open output file: $!";
 | |
| 
 | |
# Read the index. Each line holds tab-separated fields: the headword, then
# the offset and the length of its definition, both passed through base64().
# Fills @def_list (headwords), %def_begin and %def_length (keyed by headword).
while (my $line = <INDEX>) {
	# skip entries whose headword starts with "00database" / "00-database"
	next if $line =~ /^00-?database/;

	my ($word, $offset, $length) = split /\t|\n/, $line;
	next unless defined $word && $word ne '';	# blank line, no headword

	# Drop leading whitespace, keep at most $word_limit characters and
	# lowercase the result. The leading whitespace is optional, so the
	# rule also applies to headwords that start in the first column.
	$word =~ s/^\s*(.{1,$word_limit}).*$/\L$1/;

	push @def_list, $word;
	$def_begin{$word}  = base64($offset);
	$def_length{$word} = base64($length);
}
 | |
| 
 | |
# Sort the headwords. The index is usually sorted already, but nothing
# guarantees it, so sort here. A headword that was listed more than once
# has a single slot in %def_begin / %def_length, so keep one copy of it
# rather than printing the same line several times.
my %seen;
@def_list = sort grep { !$seen{$_}++ } @def_list;

# Read the whole definitions file into memory. It is compressed and the
# definitions are fetched by offset, so it is decompressed once up front.
$def_all = '' unless defined $def_all;
my $chunk;
my $status;
# gzread returns the number of bytes read, 0 at end of file and -1 on
# error. -1 is a true value, so compare against 0 explicitly.
while (($status = $DICT->gzread($chunk)) > 0) {
	$def_all .= $chunk;
}
die "Could not read definitions file: " . $DICT->gzerror() . "\n"
	if $status < 0;
$DICT->gzclose();

# Write one "headword<TAB>definition" line per headword.
foreach my $word (@def_list) {
	my $def = substr $def_all, $def_begin{$word}, $def_length{$word};
	$def = '' unless defined $def;	# offset past the end of the data
	$def =~ s/\n\s*/ /g;		# join lines: newline plus indent => one space
	print RDFOUT $word . "\t" . $def . "\n";
}

# Buffered write errors only show up when the handle is closed.
close RDFOUT or die "Could not write output file: $!";
 | |
| 
 |