[UVA][SA後綴數組] 10234 - Frequent Substrings
Problem F: Frequent Substrings |
You are working for the R&D department of International Embedded Equipments Enterprise (IEEE). One day, your boss calls you up and tells this:
"Yes, I think we
should use the N-Gram Analysis for the project. What do you think ??"
"Ye...s, I also thought so...", you replied.
After the conversation, having no clue about what he said, you start searching the internet and find the following sentences.
N-Grams are substrings of length N. N-Gram Analysis is a method of matching text, based on the statistical similarity of occurrences of N-Grams. N-Gram Analysis is used in research areas such as speech recognition, database interfacing, network communication etc.
After going through the topic a few times and reading several articles, you figure out the main parts of the project you have to accomplish. One of the parts required you to do this: Given a string S, you have to find the most frequently occurring N-Gram in S. Since you found this part interesting, you decide to complete it soon. Note that the occurrences can partially overlap. That is, the string bcbcbc has two occurrences of the substring cbc.
Input
Input consists of several test cases. Each test case begins with a line specifying the string S. S will contain no more than 1,000 characters. S can contain any of the printable ASCII characters. Capitalization should be ignored while finding the N-Grams. The next line specifies an integer T. Next T lines give the values for N, 0< N£ Length(S).
Output
For each test case, output T lines specifying
the number of occurrences of the N-Gram followed by the N-Gram
itself separated by exactly one
space. If there are several such N-Grams, output the least lexicographical
one, compared in terms of ASCII values.
Note : Given two strings a=a0a1a2...am
and b=b0b1b2...bm
over the same alphabet V, we say that a is lexicographically
less than b if there exist an integer j, 0£
j£
m, so that ai=bi for all i=0,1,...,j-1
and aj<
bj.
Sample Input
In theory, there is no difference between theory and practice, but in practice, there is. 2 4 9
Sample Output
4 the 2 practice
題目要求找到長度為 N 的頻率最高的子字串。
這題很坑爹的是,沒有看到' '(空白字元)出現在 the 跟 practice 前面,一度以為題目有問題。
建造高度數組出來,然後掃描高度數組,最長區間都大於等於 N 的最小索引值。
但要特別處理 N = 1 的時候。
// 0.012s
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
struct SuffixArray {
int sa[1005], h[1005], n;
char str[1005];
int w[1005], ta[1005], tb[1005]; // buffer
void sort(int *x, int *y, int m) { // radix sort
static int i;
for(i = 0; i < m; i++)
w[i] = 0;
for(i = 0; i < n; i++)
w[x[y[i]]]++;
for(i = 1; i < m; i++)
w[i] += w[i-1];
for(i = n-1; i >= 0; i--)
sa[--w[x[y[i]]]] = y[i];
}
bool cmp(int *r, int a, int b, int l) {
if(r[a] == r[b]) {
if(a+l >= n || b+l >= n)
return false;
return r[a+l] == r[b+l];
}
return false;
}
void build_h() {
int i, j, k;
for(i = 0; i < n; i++) ta[sa[i]] = i;
for(i = 0; i < n; i++) {
if(ta[i] == 0) {
h[ta[i]] = 0;
continue;
}
if(i == 0 || h[ta[i-1]] <= 1)
k = 0;
else
k = h[ta[i-1]]-1;
while(str[sa[ta[i]-1]+k] == str[sa[ta[i]]+k])
k++;
h[ta[i]] = k;
}
}
void build() {// x: rank, y: second key(array index)
int i, k, m = 128, p;
int *x = ta, *y = tb, *z;
n = strlen(str);
x[n] = 0;
for(i = 0; i < n; i++)
x[i] = str[i], y[i] = i;
sort(x, y, m);
for(k = 1, p = 1; p < n; k *= 2, m = p) {
for(p = 0, i = n-k; i < n; i++)
y[p++] = i;
for(i = 0; i < n; i++) {
if(sa[i] >= k) {
y[p++] = sa[i]-k;
}
}
sort(x, y, m);
z = x, x = y, y = z;
for(i = 1, p = 1, x[sa[0]] = 0; i < n; i++)
x[sa[i]] = cmp(y, sa[i-1], sa[i], k) ? p-1 : p++;
}
}
};
int main() {
SuffixArray SA;
while(gets(SA.str)) {
SA.n = strlen(SA.str);
int T, N, i;
for(i = 0; i < SA.n; i++) {
if(SA.str[i] >= 'A' && SA.str[i] <= 'Z')
SA.str[i] = SA.str[i]-'A'+'a';
}
SA.build();
SA.build_h();
/*for(i = 0; i < SA.n; i++)
printf("%s\n%d\n", SA.str+SA.sa[i], SA.h[i]);*/
scanf("%d", &T);
while(T--) {
scanf("%d", &N);
int mx_f = 0, mx_f_arg = 0;
for(i = 0; i < SA.n; i++) {
if(SA.h[i] >= N) {
int cnt = 1, j = i;
while(j < SA.n && SA.h[j] >= N)
cnt++, j++;
if(cnt > mx_f) {
mx_f = cnt;
mx_f_arg = SA.sa[i-1];
}
i = j-1;
}
}
if(mx_f == 0) {
for(i = 0; i < SA.n; i++) {
if(SA.n - SA.sa[i] >= N) {
mx_f = 1;
mx_f_arg = SA.sa[i];
break;
}
}
}
printf("%d ", mx_f);
for(i = 0; i < N; i++)
putchar(SA.str[mx_f_arg+i]);
puts("");
}
while(getchar() != '\n');
}
return 0;
}