#!/usr/bin/perl
use Compress::Zlib;
use Text::ParseWords;
use Jcode;
# Key generator: for every data file, print the file name, then for each
# CSV record the byte offset where the record starts ("\t<offset>") followed
# by one CRC32 per token of columns 3 and 4 ("\t\t<crc32>").
#
# Character normalization applied to columns 3 and 4 before tokenizing:
# characters listed in $from are mapped onto the ones in $to.
my $from = 'A-Za-z0-9a-z';
my $to = 'A-ZA-Z0-9A-Z';

opendir(my $dir, "./data") or die "can't open ./data: $!";
my @files = sort readdir($dir);
closedir($dir);

foreach my $fname (@files) {
    # The original pattern /G*.csv/ was a shell glob written as a regex and
    # matched any name containing "<any char>csv".  Anchor it so that only
    # G<anything>.csv is indexed.
    next unless $fname =~ /^G.*\.csv$/;

    my $pname = "./data/" . $fname;
    print $fname, "\n";

    # 3-arg open with a lexical handle: the file name can never be
    # interpreted as an open mode.
    open(my $fh, '<', $pname) or die "can't open $pname: $!";

    # Offset of the first record (always 0 right after open).
    print "\t", tell($fh), "\n";

    while (my $record = <$fh>) {
        my @line = quotewords(",", 0, $record);
        $line[3] = Jcode->new($line[3])->tr($from, $to);
        $line[4] = Jcode->new($line[4])->tr($from, $to);

        # The text is passed to the shell inside single quotes, so every
        # embedded single quote must be closed, escaped and reopened;
        # otherwise a quote in the data ends the argument early.
        # printf '%s\n' is used instead of echo so that backslashes in the
        # data are never interpreted by the shell's echo.
        my $text = "$line[3] $line[4]";
        $text =~ s/'/'\\''/g;
        my @result = `printf '%s\\n' '$text' | chasen -F '%m\n'`;

        foreach my $token (@result) {
            # chasen terminates each sentence with an "EOS" line.
            next if $token =~ /^EOS$/;
            # The token keeps its trailing newline on purpose: the lookup
            # side hashes the line the same way.
            print "\t\t", crc32($token), "\n";
        }

        # Offset of the next record; the keys printed after this line
        # belong to the record that starts here.
        print "\t", tell($fh), "\n";
    }
    close($fh);
}
インデックスファイルの作成

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Maximum number of file-name characters stored per entry (the buffers
 * are FILENAMELEN + 1 bytes to leave room for the terminating NUL). */
#define FILENAMELEN 7
/* Size of the line buffer used when reading stdin in main(). */
#define BUFFERLEN 1024
/* Output file holding one (key, offset into TABLEFILE) record per key. */
#define TOCFILE "toc.dat"
/* Output file holding the (filename, location) records grouped by key. */
#define TABLEFILE "table.dat"
/* One occurrence of a key: which file it was seen in and at what offset. */
typedef struct entry {
unsigned long key;               /* numeric key parsed from a "\t\t<key>" input line */
char filename[FILENAMELEN + 1];  /* NUL-terminated file name, truncated to FILENAMELEN chars */
long location;                   /* value parsed from the preceding "\t<location>" input line */
} ENTRY;
/* Growable array of ENTRY records. */
typedef struct index {
int count;       /* number of entries currently stored */
int size;        /* number of entries the array has room for (0 = not yet allocated) */
ENTRY *entries;  /* heap-allocated array, grown by realloc_entry() */
} INDEX;
/* File-scope pointer; none of the functions visible in this listing
 * reference it (they all work through INDEX.entries). */
ENTRY *entries;
/*
 * Grow the entry array of `index`.
 *
 * The first call (size == 0) allocates room for 100 entries; every later
 * call doubles the capacity while preserving the stored entries.
 * Prints "Memory overflow" and exits with status 1 if allocation fails.
 *
 * The previous implementation doubled `size` first and then copied
 * `size` entries out of the old (half as large) buffer, reading past its
 * end, and it only checked for allocation failure after that copy.
 */
void realloc_entry(INDEX *index)
{
    int new_size;
    ENTRY *new_entries;

    if (index->size == 0) {
        /* index->entries is not guaranteed to be initialised before the
         * first allocation, so it must not be handed to realloc(). */
        new_size = 100;
        new_entries = (ENTRY *)malloc(sizeof(ENTRY) * new_size);
    }
    else {
        new_size = index->size * 2;
        new_entries = (ENTRY *)realloc(index->entries, sizeof(ENTRY) * new_size);
    }
    if (new_entries == NULL) {
        printf("Memory overflow\n");
        exit(1);
    }
    index->entries = new_entries;
    index->size = new_size;
}
/*
 * Append one (key, filename, location) record to `index`, growing the
 * array first when it is full.  At most FILENAMELEN characters of
 * `filename` are stored; the stored name is always NUL-terminated.
 */
void add_entry(INDEX *index, unsigned long key, char *filename, long location)
{
    ENTRY *slot;

    if (index->count == index->size) {
        realloc_entry(index);
    }

    /* Take the slot address only after a possible reallocation. */
    slot = &index->entries[index->count];
    slot->key = key;
    slot->location = location;
    strncpy(slot->filename, filename, FILENAMELEN);
    slot->filename[FILENAMELEN] = '\0';

    index->count++;
}
/*
 * qsort() comparator: orders ENTRY records by ascending key.
 * Returns 1, -1 or 0 when the first key is greater than, less than or
 * equal to the second.
 */
int cmpentry(const void *key1, const void *key2)
{
    const ENTRY *lhs = (const ENTRY *)key1;
    const ENTRY *rhs = (const ENTRY *)key2;

    if (lhs->key == rhs->key) {
        return 0;
    }
    return (lhs->key > rhs->key) ? 1 : -1;
}
/*
 * Write the sorted entries of `index` to TOCFILE and TABLEFILE.
 *
 * TABLEFILE receives, for each distinct key, its (filename, location)
 * records; consecutive groups are separated by an all-zero sentinel
 * record.  TOCFILE receives one (key, offset of the group in TABLEFILE)
 * record per distinct key.  Values are written in the host's native
 * representation of `unsigned long` / `long`.
 *
 * `index->entries` must already be sorted by key.  Exits with status 1 if
 * either file cannot be opened or written.
 */
void write_index(INDEX *index)
{
    FILE *fp_toc;
    FILE *fp_table;
    int i;
    int ok = 1;
    char sentinel_fname[FILENAMELEN + 1];
    long sentinel_location = 0;

    memset(sentinel_fname, '\0', sizeof(sentinel_fname));

    /* Both files hold raw binary records, hence "wb". */
    if ((fp_toc = fopen(TOCFILE, "wb")) == NULL) {
        printf("can't open %s\n", TOCFILE);
        exit(1);
    }
    if ((fp_table = fopen(TABLEFILE, "wb")) == NULL) {
        printf("can't open %s\n", TABLEFILE);
        exit(1);
    }

    for (i = 0; i < index->count && ok; i++) {
        ENTRY *e = &index->entries[i];

        /* A new group starts at the first entry and whenever the key
         * changes.  Comparing with the previous entry (instead of a magic
         * initial value of -1) keeps a first key of ULONG_MAX from losing
         * its TOC record. */
        if (i == 0 || e->key != index->entries[i - 1].key) {
            long location;

            if (i > 0) {
                /* Close the previous group with a sentinel record. */
                ok = ok && fwrite(sentinel_fname, sizeof(sentinel_fname), 1, fp_table) == 1;
                ok = ok && fwrite(&sentinel_location, sizeof(long), 1, fp_table) == 1;
            }
            location = ftell(fp_table);
            ok = ok && location != -1L;
            ok = ok && fwrite(&e->key, sizeof(unsigned long), 1, fp_toc) == 1;
            ok = ok && fwrite(&location, sizeof(long), 1, fp_toc) == 1;
        }
        ok = ok && fwrite(e->filename, sizeof(e->filename), 1, fp_table) == 1;
        ok = ok && fwrite(&e->location, sizeof(long), 1, fp_table) == 1;
    }

    /* Buffered write errors may only surface at fclose(). */
    if (fclose(fp_table) != 0) {
        ok = 0;
    }
    if (fclose(fp_toc) != 0) {
        ok = 0;
    }
    if (!ok) {
        printf("can't write %s / %s\n", TOCFILE, TABLEFILE);
        exit(1);
    }
}
/*
 * Index builder entry point.
 *
 * Reads the key listing from stdin and writes TOCFILE / TABLEFILE.
 * Input lines come in three shapes:
 *   "<filename>"     - starts a new file (echoed to stdout as progress)
 *   "\t<location>"   - offset that the following keys belong to
 *   "\t\t<key>"      - one key occurring at the current file/location
 */
int main(int argc, char* argv[])
{
    char buffer[BUFFERLEN];
    INDEX index;
    char fname[FILENAMELEN + 1];
    long location = 0;
    unsigned long key;

    (void)argc;
    (void)argv;

    memset(buffer, '\0', BUFFERLEN);
    /* Start with an empty name so that keys arriving before any file-name
     * line do not copy uninitialised bytes. */
    memset(fname, '\0', sizeof(fname));
    index.size = 0;
    index.count = 0;
    index.entries = NULL;

    while (fgets(buffer, sizeof(buffer), stdin) != NULL) {
        if (buffer[0] != '\t') { /* This line means filename. */
            size_t len = strlen(buffer);

            if (len > 0 && buffer[len - 1] == '\n') {
                buffer[len - 1] = '\0';
            }
            strncpy(fname, buffer, FILENAMELEN);
            fname[FILENAMELEN] = '\0';
            printf("%s\n", fname);
        }
        else if (buffer[1] != '\t') {
            /* Base 10 (not 0): the listing is decimal, and base 0 would
             * read a leading zero as octal. */
            location = strtol(&buffer[1], NULL, 10);
        }
        else {
            key = strtoul(&buffer[2], NULL, 10);
            add_entry(&index, key, fname, location);
        }
    }

    /* With no entries index.entries is NULL; skip the sort but still
     * write (empty) output files. */
    if (index.count > 0) {
        qsort(index.entries, index.count, sizeof(ENTRY), cmpentry);
    }
    write_index(&index);
    free(index.entries);
    return 0;
}
単語の検索

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
/* Size of the line buffer used when reading keys from stdin. */
#define BUFFERLEN 1024
/* Maximum stored file-name length (buffers are FILENAMELEN + 1 bytes). */
#define FILENAMELEN 7
/* Maximum number of distinct keys accepted per run. */
#define MAXKEY 100
/* File holding (key, offset into TABLEFILE) records, read by read_toc(). */
#define TOCFILE "toc.dat"
/* File holding the (filename, location) records, read by main(). */
#define TABLEFILE "table.dat"
/* One TOCFILE record. */
typedef struct entry {
unsigned long key;  /* search key */
long location;      /* offset in TABLEFILE that main() seeks to for this key */
} ENTRY;
/* One TABLEFILE record: a position inside a data file. */
typedef struct location {
char filename[FILENAMELEN + 1];  /* file name as read from TABLEFILE */
long location;                   /* position value printed next to the file name by main() */
} LOCATION;
/* Table of contents loaded by read_toc() and searched with bsearch() in main(). */
ENTRY *toc_entries;
/*
 * Load TOCFILE into the global `toc_entries` array.
 *
 * Each record is an `unsigned long` key followed by a `long` location in
 * the host's native representation.  Returns the number of records read.
 * Exits with status 1 if the file cannot be opened, stat'ed or fully
 * read, or if memory cannot be allocated.
 *
 * The record size is derived from the field types rather than hard-coded
 * as 8, so the entry count agrees with what the read loop consumes on
 * platforms where `long` is not 4 bytes.
 */
int read_toc()
{
    const size_t record_size = sizeof(unsigned long) + sizeof(long);
    struct stat statbuf;
    int toc_size;
    int fd;
    int i;

    if ((fd = open(TOCFILE, O_RDONLY)) == -1) {
        exit(1);
    }
    if (fstat(fd, &statbuf) == -1) {
        exit(1);
    }
    toc_size = (int)(statbuf.st_size / (off_t)record_size);

    /* Allocate at least one element so an empty TOC does not depend on
     * the implementation-defined result of malloc(0). */
    toc_entries = (ENTRY *)malloc(sizeof(ENTRY) * (toc_size > 0 ? toc_size : 1));
    if (toc_entries == NULL) {
        exit(1);
    }

    for (i = 0; i < toc_size; i++) {
        if (read(fd, &toc_entries[i].key, sizeof(unsigned long))
                != (ssize_t)sizeof(unsigned long)) {
            exit(1);
        }
        if (read(fd, &toc_entries[i].location, sizeof(long))
                != (ssize_t)sizeof(long)) {
            exit(1);
        }
    }
    close(fd);
    return toc_size;
}
/*
 * bsearch() comparator: orders ENTRY records by ascending key.
 * Returns -1, 1 or 0 when the first key is less than, greater than or
 * equal to the second.
 */
int cmpentry(const void *key1, const void *key2)
{
    const ENTRY *lhs = (const ENTRY *)key1;
    const ENTRY *rhs = (const ENTRY *)key2;

    if (lhs->key == rhs->key) {
        return 0;
    }
    return (lhs->key < rhs->key) ? -1 : 1;
}
/*
 * Read one (filename, location) record from `fd` into `loc`.
 *
 * Exits with status 2 if the file name cannot be read completely and
 * with status 3 if the location cannot.
 *
 * The byte counts come from the field sizes instead of the literals 8
 * and 4: the index writer emits the location as a native `long`, which
 * is not 4 bytes on every platform.
 */
void read_location(int fd, LOCATION *loc)
{
    if (read(fd, loc->filename, sizeof(loc->filename))
            != (ssize_t)sizeof(loc->filename)) {
        exit(2);
    }
    if (read(fd, &loc->location, sizeof(loc->location))
            != (ssize_t)sizeof(loc->location)) {
        exit(3);
    }
}
/*
 * Comparator for LOCATION records: file name first (strcmp order), then
 * ascending location.  Returns the strcmp() result when the names
 * differ, otherwise 1, -1 or 0 according to the locations.
 */
int cmplocation(const void *key1, const void *key2)
{
    const LOCATION *lhs = (const LOCATION *)key1;
    const LOCATION *rhs = (const LOCATION *)key2;
    int by_name = strcmp(lhs->filename, rhs->filename);

    if (by_name != 0) {
        return by_name;
    }
    if (lhs->location > rhs->location) {
        return 1;
    }
    if (lhs->location < rhs->location) {
        return -1;
    }
    return 0;
}
/*
 * Sort `set` (by file name, then location) and remove duplicates in
 * place.  Returns the number of distinct records, which occupy the
 * front of the array afterwards.
 *
 * The previous version scanned for duplicates without checking the array
 * bound, passed an element count to memcpy() as a byte count, used
 * memcpy() on overlapping regions, and kept iterating up to the original
 * count after compaction.
 */
int normalize_set(LOCATION *set, int count)
{
    int read_pos;
    int write_pos;

    /* Zero or one record is already sorted and unique. */
    if (count <= 1) {
        return count;
    }

    qsort(set, count, sizeof(LOCATION), cmplocation);

    /* set[0 .. write_pos-1] always holds the distinct records seen so far. */
    write_pos = 1;
    for (read_pos = 1; read_pos < count; read_pos++) {
        if (cmplocation(&set[write_pos - 1], &set[read_pos]) != 0) {
            if (write_pos != read_pos) {
                set[write_pos] = set[read_pos];
            }
            write_pos++;
        }
    }
    return write_pos;
}
/* Copy the location value and the NUL-terminated file name of `src` into `dest`. */
void copy_location(LOCATION *dest, LOCATION *src)
{
    dest->location = src->location;
    strcpy(dest->filename, src->filename);
}
int main(int argc, char *argv[])
{
long keys[MAXKEY];
int num_of_keys = 0;
LOCATION *lset1 = NULL;
LOCATION *lset2 = NULL;
int nlset;
int nlset1;
int nlset2;
char buffer[BUFFERLEN];
int toc_size;
int fd;
int i;
int j;
if ((fd = open(TABLEFILE, O_RDONLY)) == -1) {
return 1;
}
toc_size = read_toc();
while (fgets(buffer, sizeof(buffer), stdin) != NULL) {
ENTRY search_entry;
ENTRY *found_entry;
ENTRY *next_entry;
int cur;
int found_key;
search_entry.key = strtoul(buffer, NULL, 10);
found_key = 0;
for (i = 0; i < num_of_keys; i++) {
if (keys[i] == search_entry.key) {
found_key = 1;
break;
}
}
if (found_key) {
continue;
}
if (num_of_keys == MAXKEY) {
break;
}
keys[num_of_keys++] = search_entry.key;
found_entry = (ENTRY *)bsearch(&search_entry,
toc_entries,
toc_size,
sizeof(ENTRY),
cmpentry);
if (found_entry == NULL) {
return 0;
}
next_entry = found_entry + 1;
nlset = (next_entry->location - found_entry->location) / sizeof(LOCATION
) - 1;
lseek(fd, found_entry->location, SEEK_SET);
if (lset1 == NULL) {
lset1 = (LOCATION *)malloc(sizeof(LOCATION) * nlset);
for (i = 0; i < nlset; i++) {
read_location(fd, &lset1[i]);
}
nlset1 = normalize_set(lset1, nlset);
continue;
}
lset2 = (LOCATION *)malloc(sizeof(LOCATION) * nlset);
for (i = 0; i < nlset; i++) {
read_location(fd, &lset2[i]);
}
nlset2 = normalize_set(lset2, nlset);
cur = 0;
for (j = 0; j < nlset2; j++) {
for (i = cur; i < nlset1; i++) {
if (cmplocation(&lset1[i], &lset2[j]) == 0) {
if (i != cur) {
copy_location(&lset1[cur], &lset2[j]);
}
cur++;
break;
}
}
}
nlset1 = cur;
}
for (i = 0; i < nlset1; i++) {
printf("%s %d\n", lset1[i].filename, lset1[i].location);
}
return 0;
}
ユーザインターフェース(抜粋)

/**
 * Find the CSV records that contain every word of the query string.
 *
 * The query is normalized, split into words with ./chasen, each word is
 * hashed with CRC32 and fed to ./search; every "<file> <offset>" line
 * that comes back is resolved to a 5-column CSV record from ./data.
 * Standard error of both helper processes is appended to ./error.log.
 *
 * NOTE: this is an excerpt; $result is collected here, but the code that
 * renders it is not part of the listing, and $group is not used in the
 * visible part.
 *
 * @param string $k     Query string entered by the user.
 * @param mixed  $group Not used in the visible part of the function.
 */
function print_result($k, $group) {
    $descriptorspec = array(
        0 => array("pipe", "r"),
        1 => array("pipe", "w"),
        2 => array("file", "./error.log", "a")
    );
    $k = mb_strtoupper(mb_convert_kana($k, "KVA"));

    // Defined up front so the search step still works (with no keys)
    // when chasen cannot be started.
    $output = array();
    $pipes = array();
    $process = proc_open("./chasen -F '%m\n'", $descriptorspec, $pipes);
    if (is_resource($process)) {
        fwrite($pipes[0], $k . "\n");
        fclose($pipes[0]);
        // Read until fgets() reports EOF and drop the "EOS" terminator
        // explicitly, instead of pushing whatever feof()/fgets() yield
        // and blindly popping the last two elements.
        while (($token = fgets($pipes[1], 1024)) !== false) {
            if (rtrim($token, "\r\n") === 'EOS') {
                continue;
            }
            // Keep the trailing newline: the index was built from
            // CRC32 values of whole output lines.
            array_push($output, $token);
        }
        fclose($pipes[1]);
        $return_value = proc_close($process);
    }

    $result = array();
    $pipes = array();
    $process = proc_open("./search", $descriptorspec, $pipes);
    if (is_resource($process)) {
        foreach ($output as $hash) {
            // crc32() can return a negative integer on 32-bit PHP;
            // "%u" always yields the unsigned decimal form.
            fwrite($pipes[0], sprintf("%u", crc32($hash)) . "\n");
        }
        fclose($pipes[0]);
        while (($line = fgets($pipes[1], 1024)) !== false) {
            // Strip the line terminator so the offset is a clean number.
            $entry = explode(' ', rtrim($line, "\r\n"));
            if (count($entry) != 2) {
                continue;
            }
            $fp = @fopen("./data/" . $entry[0], "r");
            if ($fp === false) {
                continue;
            }
            fseek($fp, (int)$entry[1]);
            $data = fgetcsv($fp, 512);
            // fgetcsv() returns false at EOF or on error; count() must
            // only be applied to an array.
            if (is_array($data) && count($data) == 5) {
                array_push($result, $data);
            }
            fclose($fp);
        }
        fclose($pipes[1]);
        $return_value = proc_close($process);
    }
}
![]()

![]()