-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathrabin.pl
More file actions
127 lines (107 loc) · 3.32 KB
/
rabin.pl
File metadata and controls
127 lines (107 loc) · 3.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#! perl
use strict;
use warnings;
BEGIN {
eval "use Smart::Comments;";
}
use Algorithm::RabinKarp;
use Algorithm::RabinKarp::Util qw(filter_regexp stream_fh);
use File::Find::Rule;
use IO::Handle;
use Fcntl qw(:seek);
my @rec;
my %occurances;
my $rule = File::Find::Rule->new;
my @files = $rule->or($rule->new
->directory
->name(qr/(blib6?|inc)/)
->prune
->discard,
$rule->new->file->name(shift @ARGV))->in(@ARGV);
for my $file ( @files ) { ### Hashing ===[%] done
open my $fh, '<', $file;
my $kgram = Algorithm::RabinKarp->new( 80, filter_regexp( qr/[\s;#]/, stream_fh($fh) ) );
# Create hash data for each kgram inside the document
while (my ($hash, @pos) = $kgram->next) {
$occurances{$hash}++;
push @rec, [$hash, $file, @pos];
}
close $fh;
}
use constant HASH => 0;
use constant FILE => 1;
use constant HASH_START => 2;
use constant HASH_END => 3;
use constant KGRAMS => 4;
@rec = sort { $a->[FILE] cmp $b->[FILE]
or $a->[HASH_START] <=> $b->[HASH_START] }
grep{ $occurances{ $_->[HASH] } > 1 } # at least one appearance.
@rec;
# If the previous line has the same count and same file, and
# the start of the next line is before the end of the previous line
# merge them together.
my @newrec = shift @rec; # enforce invariant that there is always a previous
# element
for my $curr (@rec) { ### Joining ranges ===[%] done
my $prev = $newrec[-1] or die "Something evil has happened";
if ($occurances{ $curr->[HASH] } == $occurances{ $prev->[HASH] }
&& $curr->[FILE] eq $prev->[FILE]
&& $curr->[HASH_START] <= $prev->[HASH_END]
) {
$prev->[HASH_END] = $curr->[HASH_END];
} else {
push @newrec, $curr;
}
}
my %chunks;
my $last = '';
sub dumpit {
for my $text (keys %chunks) {
my @files = keys %{ $chunks{$text} };
next unless @files > 1;
print "====\n";
for my $file ( @files) {
for my $pos (@{ $chunks{$text}{$file} }) {
print " $file lines ".$pos->[0][-1].':'.$pos->[1][-1]."\n";
}
}
print ">>>>\n$text\n----\n";
}
%chunks = ();
}
for my $rec (sort {
$occurances{ $b->[HASH] } <=> $occurances{ $a->[HASH] }
or $a->[HASH] <=> $b->[HASH]
or span($b) <=> span($a)
} @newrec) { ### Emitting Report ===[%] done
my ($hash, $file_name, $start_offset, $end_offset, $s, $e) = @$rec;
dumpit() if ($last ne $hash);
$last = $hash;
push @{ $chunks{emit_fragment($file_name,$start_offset,$end_offset)}{$file_name}}, [$s,$e];
}
dumpit();
sub span {
my $rec = shift;
$rec->[HASH_END] - $rec->[HASH_START] + 1
}
use Fcntl qw(SEEK_SET);
sub emit_fragment {
my ($file, $start, $end) = @_;
open(my $fh, '<', $file)
or die "Can't open $file: $!";
my $bytes = $end - $start + 1;
my $buf;
seek($fh, $start, SEEK_SET);
read $fh, $buf, $bytes;
close $fh;
return $buf;
}
sub hash {
my $val = shift;
return {
COUNT => $occurances{ $val->[HASH] },
FILE => $val->[FILE],
HASH_START => $val->[HASH_START],
HASH_END => $val->[HASH_END],
};
}