#!/usr/bin/perl
#
# Tokenizer front end for the index builder.
#
# Scans ./data for CSV files whose names start with "G" and end in ".csv",
# runs columns 4 and 5 of every record through the external `chasen`
# morphological analyser, and prints a tab-indented stream on stdout:
#
#   <file name>
#   \t<byte offset of the next record>
#   \t\t<CRC32 of one token line>      (zero or more per record)
#   \t<byte offset of the next record>
#   ...
#
# The offset line that precedes a group of CRC lines is the start offset of
# the record those tokens came from.
use strict;
use warnings;

use Compress::Zlib;
use IPC::Open2;
use Text::ParseWords;
use Jcode;

# Character ranges handed to Jcode's tr().
# NOTE(review): the first and last ranges on each side are duplicated, which
# suggests the originals were full-width characters that were flattened to
# ASCII at some point -- confirm against the original script.
my $from = 'A-Za-z0-9a-z';
my $to   = 'A-ZA-Z0-9A-Z';

my $data_dir = './data';

# Feed one line of text to chasen and return its raw output lines.
# The lines keep their trailing newline on purpose: the CRC32 is computed
# over the line including that newline.
# chasen is started without a shell, so quotes or other metacharacters in
# the CSV data cannot alter the command that is run.
sub tokenize {
    my ($text) = @_;

    # open2 dies on its own if chasen cannot be started.
    my $pid = open2(my $from_chasen, my $to_chasen, 'chasen', '-F', "%m\n");

    # A single short line fits in the pipe buffer, so writing everything
    # before reading cannot deadlock here.
    print {$to_chasen} $text, "\n";
    close $to_chasen or die "close chasen stdin: $!";

    my @token_lines = <$from_chasen>;
    close $from_chasen;
    waitpid $pid, 0;

    return @token_lines;
}

opendir(my $dh, $data_dir) or die "opendir $data_dir: $!";
my @files = sort readdir($dh);
closedir($dh);

for my $fname (@files) {
    # Equivalent of the shell glob G*.csv.
    next unless $fname =~ /\AG.*\.csv\z/;

    my $pname = "$data_dir/$fname";
    print $fname, "\n";

    open(my $fh, '<', $pname) or die "open $pname: $!";
    print "\t", tell($fh), "\n";

    while (my $record = <$fh>) {
        chomp $record;
        my @line = quotewords(',', 0, $record);

        # Short or unparsable rows yield empty text instead of undef.
        my ($col4, $col5) = map { defined $_ ? $_ : '' } @line[3, 4];
        $col4 = Jcode->new($col4)->tr($from, $to);
        $col5 = Jcode->new($col5)->tr($from, $to);

        for my $token_line (tokenize("$col4 $col5")) {
            next if $token_line =~ /^EOS$/;    # chasen's end-of-sentence marker
            print "\t\t", crc32($token_line), "\n";
        }

        # Explicit handle: the bare form of tell() follows the last handle
        # read, which is now chasen's output pipe.
        print "\t", tell($fh), "\n";
    }

    close($fh);
}

# Section heading from the source document (belongs to the listing that
# follows): "Creating the index file"
/*
 * Index builder.
 *
 * Reads a tab-indented stream from stdin:
 *   line not starting with a tab  -> data file name (first FILENAMELEN chars)
 *   line starting with one tab    -> byte offset within that file
 *   line starting with two tabs   -> key belonging to the last offset seen
 * and writes two binary files:
 *   TABLEFILE: for each distinct key, its (filename, location) records;
 *              consecutive key groups are separated by one all-zero record.
 *   TOCFILE:   for each distinct key in ascending order, the key and the
 *              byte offset of its group inside TABLEFILE.
 *
 * Keys and locations are stored as 4-byte integers and file names as
 * FILENAMELEN + 1 bytes, so the layout no longer depends on sizeof(long).
 */
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define FILENAMELEN 7
#define BUFFERLEN 1024
#define TOCFILE "toc.dat"
#define TABLEFILE "table.dat"

/* One (key, file, offset) posting held in memory. */
typedef struct entry {
    unsigned long key;
    char filename[FILENAMELEN + 1];
    long location;
} ENTRY;

/* Growable array of postings. */
typedef struct index {
    int count;          /* number of entries in use */
    int size;           /* number of entries allocated */
    ENTRY *entries;
} INDEX;

/* Not referenced by the code in this listing; kept for compatibility. */
ENTRY *entries;

/*
 * Grow index->entries: 100 slots on first use, double afterwards.
 * When index->size is 0 the entries pointer is ignored, so callers need
 * not initialise it. Exits with status 1 if memory cannot be obtained.
 */
void realloc_entry(INDEX *index)
{
    ENTRY *old_entries = (index->size == 0) ? NULL : index->entries;
    ENTRY *new_entries;
    int new_size;

    if (index->size == 0) {
        new_size = 100;
    } else if (index->size > INT_MAX / 2) {
        fprintf(stderr, "Memory overflow\n");
        exit(1);
    } else {
        new_size = index->size * 2;
    }

    /* realloc preserves exactly the old contents; no manual copy needed. */
    new_entries = (ENTRY *)realloc(old_entries, sizeof(ENTRY) * (size_t)new_size);
    if (new_entries == NULL) {
        fprintf(stderr, "Memory overflow\n");
        exit(1);
    }

    index->entries = new_entries;
    index->size = new_size;
}

/* Append one posting, growing the array when it is full. */
void add_entry(INDEX *index, unsigned long key, char *filename, long location)
{
    ENTRY *slot;

    if (index->count == index->size) {
        realloc_entry(index);
    }

    slot = &index->entries[index->count];
    slot->key = key;
    /* strncpy zero-pads, so the bytes written to disk are deterministic. */
    strncpy(slot->filename, filename, FILENAMELEN);
    slot->filename[FILENAMELEN] = '\0';
    slot->location = location;
    index->count++;
}

/* qsort comparator: ascending by key. */
int cmpentry(const void *key1, const void *key2)
{
    const ENTRY *e1 = (const ENTRY *)key1;
    const ENTRY *e2 = (const ENTRY *)key2;

    if (e1->key > e2->key) {
        return 1;
    } else if (e1->key < e2->key) {
        return -1;
    } else {
        return 0;
    }
}

/* fopen wrapper that reports the failure and exits with status 1. */
static FILE *open_or_die(const char *name)
{
    FILE *fp = fopen(name, "wb");

    if (fp == NULL) {
        fprintf(stderr, "can't open %s\n", name);
        exit(1);
    }
    return fp;
}

/* fwrite wrapper that reports a short write and exits with status 1. */
static void write_or_die(const void *data, size_t size, FILE *fp, const char *name)
{
    if (fwrite(data, size, 1, fp) != 1) {
        fprintf(stderr, "can't write %s\n", name);
        exit(1);
    }
}

/* Write one table record: FILENAMELEN + 1 name bytes, then a 4-byte offset. */
static void write_record(FILE *fp_table, const char *filename, long location)
{
    int32_t disk_location = (int32_t)location;

    write_or_die(filename, FILENAMELEN + 1, fp_table, TABLEFILE);
    write_or_die(&disk_location, sizeof(disk_location), fp_table, TABLEFILE);
}

/*
 * Write TOCFILE and TABLEFILE from an index that is already sorted by key.
 * Exits with status 1 on any I/O error.
 */
void write_index(INDEX *index)
{
    FILE *fp_toc;
    FILE *fp_table;
    char sentinel_fname[FILENAMELEN + 1];
    int i;

    memset(sentinel_fname, '\0', sizeof(sentinel_fname));

    fp_toc = open_or_die(TOCFILE);
    fp_table = open_or_die(TABLEFILE);

    for (i = 0; i < index->count; i++) {
        const ENTRY *current = &index->entries[i];

        /*
         * Comparing with the previous entry (instead of a "-1" start value)
         * keeps an all-ones key from being mistaken for "no key yet".
         */
        if (i == 0 || current->key != index->entries[i - 1].key) {
            uint32_t disk_key = (uint32_t)current->key;
            int32_t disk_location;
            long group_start;

            if (i > 0) {
                /* Close the previous group with an all-zero record. */
                write_record(fp_table, sentinel_fname, 0);
            }

            group_start = ftell(fp_table);
            if (group_start < 0) {
                fprintf(stderr, "can't write %s\n", TABLEFILE);
                exit(1);
            }
            disk_location = (int32_t)group_start;

            write_or_die(&disk_key, sizeof(disk_key), fp_toc, TOCFILE);
            write_or_die(&disk_location, sizeof(disk_location), fp_toc, TOCFILE);
        }

        write_record(fp_table, current->filename, current->location);
    }

    /* Buffered write errors surface at close time. */
    if (fclose(fp_table) != 0) {
        fprintf(stderr, "can't write %s\n", TABLEFILE);
        exit(1);
    }
    if (fclose(fp_toc) != 0) {
        fprintf(stderr, "can't write %s\n", TOCFILE);
        exit(1);
    }
}

int main(int argc, char* argv[])
{
    char buffer[BUFFERLEN];
    INDEX index;
    char fname[FILENAMELEN + 1];
    long location = 0;
    unsigned long key;

    (void)argc;
    (void)argv;

    memset(buffer, '\0', BUFFERLEN);
    memset(fname, '\0', sizeof(fname));
    index.size = 0;
    index.count = 0;
    index.entries = NULL;

    while (fgets(buffer, sizeof(buffer), stdin) != NULL) {
        if (buffer[0] != '\t') {
            /* This line is a file name. */
            buffer[strcspn(buffer, "\n")] = '\0';
            strncpy(fname, buffer, FILENAMELEN);
            fname[FILENAMELEN] = '\0';
            printf("%s\n", fname);
        } else if (buffer[1] != '\t') {
            /* One tab: offset of the record the following keys belong to. */
            location = strtol(&buffer[1], NULL, 0);
        } else {
            /* Two tabs: one key for the current file and offset. */
            key = strtoul(&buffer[2], NULL, 0);
            add_entry(&index, key, fname, location);
        }
    }

    if (index.count > 0) {
        qsort(index.entries, (size_t)index.count, sizeof(ENTRY), cmpentry);
    }
    write_index(&index);

    free(index.entries);
    return 0;
}

/* Section heading from the source document (belongs to the listing that
 * follows): "Word search" */
/*
 * Word search.
 *
 * Reads one decimal key per line from stdin, looks each key up in TOCFILE,
 * loads the matching (filename, location) records from TABLEFILE and prints
 * the records common to ALL keys as "<filename> <location>" lines.
 * If any key is missing from the TOC nothing is printed.
 *
 * File layout expected (fixed-width, independent of sizeof(long)):
 *   TOCFILE   : repeated { 4-byte key, 4-byte offset into TABLEFILE }
 *   TABLEFILE : repeated { FILENAMELEN + 1 name bytes, 4-byte location };
 *               consecutive key groups are separated by one extra record,
 *               and the last group runs to the end of the file.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>

#define BUFFERLEN 1024
#define FILENAMELEN 7
#define MAXKEY 100
#define TOCFILE "toc.dat"
#define TABLEFILE "table.dat"

/* On-disk record sizes in bytes. */
#define TOC_RECORD_SIZE 8
#define TABLE_RECORD_SIZE (FILENAMELEN + 1 + 4)

/* One table-of-contents entry: key and start of its group in TABLEFILE. */
typedef struct entry {
    unsigned long key;
    long location;
} ENTRY;

/* One hit: data file name and byte offset of the record inside it. */
typedef struct location {
    char filename[FILENAMELEN + 1];
    long location;
} LOCATION;

ENTRY *toc_entries;

/* Read exactly len bytes; returns non-zero on success. */
static int read_exact(int fd, void *buf, size_t len)
{
    return read(fd, buf, len) == (ssize_t)len;
}

/*
 * Load TOCFILE into the global toc_entries array.
 * Returns the number of entries; exits with status 1 on any failure.
 */
int read_toc()
{
    struct stat statbuf;
    int toc_size;
    int fd;
    int i;

    if ((fd = open(TOCFILE, O_RDONLY)) == -1) {
        exit(1);
    }
    if (fstat(fd, &statbuf) == -1) {
        exit(1);
    }

    toc_size = (int)(statbuf.st_size / TOC_RECORD_SIZE);
    /* Allocate at least one slot so an empty TOC still yields a valid pointer. */
    toc_entries = (ENTRY *)malloc(sizeof(ENTRY) * (size_t)(toc_size > 0 ? toc_size : 1));
    if (toc_entries == NULL) {
        exit(1);
    }

    for (i = 0; i < toc_size; i++) {
        uint32_t disk_key;
        int32_t disk_location;

        if (!read_exact(fd, &disk_key, sizeof(disk_key))) {
            exit(1);
        }
        if (!read_exact(fd, &disk_location, sizeof(disk_location))) {
            exit(1);
        }
        toc_entries[i].key = disk_key;
        toc_entries[i].location = disk_location;
    }

    close(fd);
    return toc_size;
}

/* bsearch/qsort comparator: ascending by key. */
int cmpentry(const void *key1, const void *key2)
{
    const ENTRY *e1 = (const ENTRY *)key1;
    const ENTRY *e2 = (const ENTRY *)key2;

    if (e1->key < e2->key) {
        return -1;
    } else if (e1->key > e2->key) {
        return 1;
    } else {
        return 0;
    }
}

/*
 * Read one table record at the current position of fd into *loc.
 * Exits with status 2 if the name is short, 3 if the location is short.
 */
void read_location(int fd, LOCATION *loc)
{
    int32_t disk_location;

    if (!read_exact(fd, loc->filename, FILENAMELEN + 1)) {
        exit(2);
    }
    /* Guarantee termination even if the file is damaged. */
    loc->filename[FILENAMELEN] = '\0';

    if (!read_exact(fd, &disk_location, sizeof(disk_location))) {
        exit(3);
    }
    loc->location = disk_location;
}

/* Comparator: by file name, then by location. */
int cmplocation(const void *key1, const void *key2)
{
    const LOCATION *loc1 = (const LOCATION *)key1;
    const LOCATION *loc2 = (const LOCATION *)key2;
    int strcmpresult;

    if ((strcmpresult = strcmp(loc1->filename, loc2->filename)) == 0) {
        if (loc1->location > loc2->location) {
            return 1;
        } else if (loc1->location < loc2->location) {
            return -1;
        } else {
            return 0;
        }
    } else {
        return strcmpresult;
    }
}

/*
 * Sort set[0..count-1] and remove duplicates in place.
 * Returns the number of distinct elements, which occupy the front of set.
 */
int normalize_set(LOCATION *set, int count)
{
    int kept;
    int i;

    if (count <= 0) {
        return 0;
    }

    qsort(set, (size_t)count, sizeof(LOCATION), cmplocation);

    /* set[0..kept-1] always holds the distinct elements seen so far. */
    kept = 1;
    for (i = 1; i < count; i++) {
        if (cmplocation(&set[kept - 1], &set[i]) != 0) {
            if (kept != i) {
                set[kept] = set[i];
            }
            kept++;
        }
    }
    return kept;
}

/* Copy one LOCATION; src->filename must be NUL-terminated. */
void copy_location(LOCATION *dest, LOCATION *src)
{
    strcpy(dest->filename, src->filename);
    dest->location = src->location;
}

/*
 * Load, sort and de-duplicate the record group of one TOC entry.
 * The group ends one separator record before the next entry's group, or at
 * the end of the table file for the last TOC entry.
 * Returns a malloc'd array (never NULL) and stores its length in *count.
 */
static LOCATION *load_group(int fd, const ENTRY *found, int toc_size,
                            long table_size, int *count)
{
    const ENTRY *toc_end = toc_entries + toc_size;
    LOCATION *set;
    long group_end;
    long nrecords;
    long i;

    if (found + 1 < toc_end) {
        group_end = (found + 1)->location - TABLE_RECORD_SIZE;
    } else {
        group_end = table_size;
    }

    nrecords = (group_end - found->location) / TABLE_RECORD_SIZE;
    if (nrecords < 0) {
        nrecords = 0;
    }

    set = (LOCATION *)malloc(sizeof(LOCATION) * (size_t)(nrecords > 0 ? nrecords : 1));
    if (set == NULL) {
        exit(1);
    }
    if (lseek(fd, (off_t)found->location, SEEK_SET) == (off_t)-1) {
        exit(1);
    }
    for (i = 0; i < nrecords; i++) {
        read_location(fd, &set[i]);
    }

    *count = normalize_set(set, (int)nrecords);
    return set;
}

/*
 * Keep in dest only the elements that also occur in other.
 * Both arrays must be sorted by cmplocation and free of duplicates.
 * Returns the new length of dest.
 */
static int intersect_sets(LOCATION *dest, int ndest,
                          const LOCATION *other, int nother)
{
    int i = 0;
    int j = 0;
    int kept = 0;

    while (i < ndest && j < nother) {
        int order = cmplocation(&dest[i], &other[j]);

        if (order < 0) {
            i++;
        } else if (order > 0) {
            j++;
        } else {
            if (kept != i) {
                dest[kept] = dest[i];
            }
            kept++;
            i++;
            j++;
        }
    }
    return kept;
}

int main(int argc, char *argv[])
{
    unsigned long keys[MAXKEY];
    int num_of_keys = 0;
    LOCATION *result = NULL;    /* running intersection; NULL until the first key */
    int nresult = 0;
    char buffer[BUFFERLEN];
    struct stat table_stat;
    int toc_size;
    int fd;
    int i;

    (void)argc;
    (void)argv;

    if ((fd = open(TABLEFILE, O_RDONLY)) == -1) {
        return 1;
    }
    if (fstat(fd, &table_stat) == -1) {
        return 1;
    }
    toc_size = read_toc();

    while (fgets(buffer, sizeof(buffer), stdin) != NULL) {
        ENTRY search_entry;
        ENTRY *found_entry;
        LOCATION *candidates;
        int ncandidates;
        int found_key = 0;

        search_entry.key = strtoul(buffer, NULL, 10);
        search_entry.location = 0;

        /* A repeated key cannot change the intersection; skip it. */
        for (i = 0; i < num_of_keys; i++) {
            if (keys[i] == search_entry.key) {
                found_key = 1;
                break;
            }
        }
        if (found_key) {
            continue;
        }
        if (num_of_keys == MAXKEY) {
            break;
        }
        keys[num_of_keys++] = search_entry.key;

        found_entry = (ENTRY *)bsearch(&search_entry, toc_entries, (size_t)toc_size,
                                       sizeof(ENTRY), cmpentry);
        if (found_entry == NULL) {
            /* One unknown key makes the whole intersection empty. */
            return 0;
        }

        candidates = load_group(fd, found_entry, toc_size,
                                (long)table_stat.st_size, &ncandidates);

        if (result == NULL) {
            result = candidates;
            nresult = ncandidates;
            continue;
        }

        nresult = intersect_sets(result, nresult, candidates, ncandidates);
        free(candidates);
    }

    for (i = 0; i < nresult; i++) {
        printf("%s %ld\n", result[i].filename, result[i].location);
    }

    free(result);
    free(toc_entries);
    close(fd);
    return 0;
}

/* Section heading from the source document (belongs to the listing that
 * follows): "User interface (excerpt)" */
/**
 * Look up the records that contain every token of a query string.
 *
 * The query is normalised, split into tokens by ./chasen, each token line
 * is hashed with crc32 and sent to ./search, and every "<file> <offset>"
 * line that ./search prints is resolved to a CSV row read from ./data.
 * stderr of both child processes is appended to ./error.log.
 *
 * NOTE(review): this listing is an excerpt; the code that renders the rows
 * (and any use of $group) is not part of it. The collected rows are
 * returned so they are not lost -- confirm against the full source.
 *
 * @param string $k     Query text entered by the user.
 * @param mixed  $group Not used by the code visible in this excerpt.
 * @return array        List of 5-column CSV rows (possibly empty).
 */
function print_result($k, $group)
{
    $descriptorspec = array(
        0 => array("pipe", "r"),
        1 => array("pipe", "w"),
        2 => array("file", "./error.log", "a")
    );

    $k = mb_strtoupper(mb_convert_kana($k, "KVA"));

    // Step 1: tokenise the query. Stays empty if chasen cannot be started.
    $output = array();
    $pipes = array();
    $process = proc_open("./chasen -F '%m\n'", $descriptorspec, $pipes);
    if (is_resource($process)) {
        fwrite($pipes[0], $k . "\n");
        fclose($pipes[0]);

        while (($line = fgets($pipes[1], 1024)) !== false) {
            // Drop chasen's end-of-sentence marker wherever it appears.
            if (rtrim($line, "\r\n") === 'EOS') {
                continue;
            }
            // The newline is kept: the hash is taken over the whole line.
            $output[] = $line;
        }

        fclose($pipes[1]);
        proc_close($process);
    }

    // Step 2: resolve the token hashes to CSV rows.
    $result = array();
    $pipes = array();
    $process = proc_open("./search", $descriptorspec, $pipes);
    if (is_resource($process)) {
        foreach ($output as $hash) {
            // %u keeps the value unsigned where crc32() returns a negative int.
            fwrite($pipes[0], sprintf('%u', crc32($hash)) . "\n");
        }
        fclose($pipes[0]);

        while (($line = fgets($pipes[1], 1024)) !== false) {
            $entry = explode(' ', rtrim($line, "\r\n"));
            if (count($entry) != 2) {
                continue;
            }

            // basename() keeps the lookup inside ./data.
            $fp = @fopen("./data/" . basename($entry[0]), "r");
            if ($fp === false) {
                continue;
            }

            if (fseek($fp, (int) $entry[1]) === 0) {
                $data = fgetcsv($fp, 512);
                // fgetcsv() returns false at end of file.
                if (is_array($data) && count($data) == 5) {
                    $result[] = $data;
                }
            }
            fclose($fp);
        }

        fclose($pipes[1]);
        proc_close($process);
    }

    return $result;
}