首页 > 代码库 > poj3693 Maximum repetition substring 后缀数组

poj3693 Maximum repetition substring 后缀数组

http://poj.org/problem?id=3693

Maximum repetition substring
Time Limit: 1000MS Memory Limit: 65536K
Total Submissions: 7241 Accepted: 2162

Description

The repetition number of a string is defined as the maximum number R such that the string can be partitioned into R same consecutive substrings. For example, the repetition number of "ababab" is 3 and "ababa" is 1.

Given a string containing lowercase letters, you are to find a substring of it with maximum repetition number.

Input

The input consists of multiple test cases. Each test case contains exactly one line, which
gives a non-empty string consisting of lowercase letters. The length of the string will not be greater than 100,000.

The last test case is followed by a line containing a '#'.

Output

For each test case, print a line containing the test case number( beginning with 1) followed by the substring of maximum repetition number. If there are multiple substrings of maximum repetition number, print the lexicographically smallest one.

Sample Input

ccabababc
daabbccaa
#

Sample Output

Case 1: ababab
Case 2: aa

Source

2008 Asia Hefei Regional Contest Online by USTC


题意:求这个字符串内部循环重复次数最多的子串。输出字典序最小的。

思路:论文上的经典题。


先枚举循环节长度L。任何长度为L的子串都至少循环一次;若循环次数不少于两次,则这段重复串必然覆盖 s[0]、s[L]、s[2L]…… 中相邻的两个位置 i 和 i+L,并且有 s[i]==s[i+L]。因此只需枚举 L 的倍数位置 i,求后缀 i 与后缀 i+L 的最长公共前缀,记为 K,那么从 i 开始至少循环了 K/L+1 次;然后把起点从 i 向前移动 L-K%L 个位置,看能否再多匹配出一个循环节。这样就得到了循环节长度为 L 时的最大循环次数。

对于求字典序最小的答案,我们按 sa 数组的顺序枚举起点,对每个能达到最大循环次数 R 的长度 L,判断后缀 sa[i] 与后缀 sa[i]+L 的最长公共前缀是否不小于 (R-1)*L,遇到的第一个满足条件的就是字典序最小的。

/**
 * @author neko01
 */
//#pragma comment(linker, "/STACK:102400000,102400000")
#include <cstdio>
#include <cstring>
#include <string.h>
#include <iostream>
#include <algorithm>
#include <queue>
#include <vector>
#include <cmath>
#include <set>
#include <map>
using namespace std;
typedef long long LL;
#define min3(a,b,c) min(a,min(b,c))
#define max3(a,b,c) max(a,max(b,c))
#define pb push_back
#define mp(a,b) make_pair(a,b)
#define clr(a) memset(a,0,sizeof a)
#define clr1(a) memset(a,-1,sizeof a)
#define dbg(a) printf("%d\n",a)
typedef pair<int,int> pp;
const double eps=1e-9;
const double pi=acos(-1.0);
// Capacity shared by all global work arrays below.
const int N=100005;
int sa[N]; // sa[k]: start index of the suffix that is ranked k-th
// sa[1~n] are the meaningful entries; sa[0] is always n (the sentinel suffix)
int rank[N]; // rank[i]: position of suffix i inside sa
// rank[0~n-1] are the meaningful entries; rank[n] is always 0
// NOTE(review): with `using namespace std;` in effect, the unqualified name
// `rank` can collide with std::rank from <type_traits> on C++11-and-later
// toolchains -- confirm against the compiler actually used.
int height[N]; // height[i]: longest common prefix of suffixes sa[i] and sa[i-1]
// height[2~n] are the meaningful entries
// Scratch buffers for build_sa: two key arrays and the counting-sort buckets.
int t1[N],t2[N],c[N];
int dp[N][20];  // sparse table for range-minimum queries over height (filled by initrmq)
/**
 * Builds the suffix array of s[0..n-1] into the global `sa` using prefix
 * doubling with counting sort.
 *
 * @param s  input symbols; every value must lie in [0, m)
 * @param n  number of symbols to sort (main passes the text length plus one
 *           so that the trailing 0 sentinel takes part in the sort)
 * @param m  exclusive upper bound of the symbol values
 *
 * On return sa[k] is the start index of the k-th smallest suffix.
 * The global scratch arrays t1, t2 and c are overwritten.
 */
void build_sa(int s[],int n,int m)
{
    if(n<=0) return;  // nothing to sort
    int *x=t1,*y=t2;
    // Round 0: counting sort on the first symbol of every suffix.
    for(int i=0;i<m;i++) c[i]=0;
    for(int i=0;i<n;i++) c[x[i]=s[i]]++;
    for(int i=1;i<m;i++) c[i]+=c[i-1];
    for(int i=n-1;i>=0;i--) sa[--c[x[i]]]=i;
    for(int j=1;j<=n;j<<=1)
    {
        int p=0;
        // Order by second key using the previous sa: suffixes whose second
        // half is empty come first, the rest follow in sa order.
        for(int i=n-j;i<n;i++) y[p++]=i;
        for(int i=0;i<n;i++)
            if(sa[i]>=j) y[p++]=sa[i]-j;
        // Stable counting sort on the first key.
        for(int i=0;i<m;i++) c[i]=0;
        for(int i=0;i<n;i++) c[x[y[i]]]++;
        for(int i=1;i<m;i++) c[i]+=c[i-1];
        for(int i=n-1;i>=0;i--) sa[--c[x[y[i]]]]=y[i];
        // Rebuild the rank array x from sa and the old ranks (now in y).
        std::swap(x,y);
        p=1,x[sa[0]]=0;
        for(int i=1;i<n;i++)
        {
            const int prev=sa[i-1];
            const int cur=sa[i];
            // A suffix with fewer than j+1 symbols has an empty second key.
            // Use -1 for it instead of reading y[] at or beyond index n,
            // which would pick up stale data from an earlier call.
            const int prevSecond=prev+j<n?y[prev+j]:-1;
            const int curSecond=cur+j<n?y[cur+j]:-1;
            x[cur]=(y[prev]==y[cur]&&prevSecond==curSecond)?p-1:p++;
        }
        if(p>=n) break;  // all ranks distinct: done
        m=p;
    }
}
void getheight(int s[],int n)
{
    int k=0;
    for(int i=0;i<=n;i++)
        rank[sa[i]]=i;
    for(int i=0;i<n;i++)
    {
        if(k) k--;
        int j=sa[rank[i]-1];
        while(s[i+k]==s[j+k]) k++;
        height[rank[i]]=k;
    }
}
/**
 * Builds the sparse table `dp` over height[1..n] so that
 * dp[pos][level] is the minimum of height[pos .. pos + 2^level - 1].
 *
 * @param n  index of the last valid height entry
 */
void initrmq(int n)
{
    // Level 0: every window has width one.
    for(int pos=1;pos<=n;++pos)
        dp[pos][0]=height[pos];
    // Each further level joins two adjacent windows of the previous level.
    for(int level=1;(1<<level)<=n;++level)
    {
        const int half=1<<(level-1);
        const int lastStart=n-(1<<level)+1;
        for(int pos=1;pos<=lastStart;++pos)
        {
            const int left=dp[pos][level-1];
            const int right=dp[pos+half][level-1];
            dp[pos][level]=left<right?left:right;
        }
    }
}
int lcp(int l,int r)
{
    int k=0;
    l=rank[l];
    r=rank[r];
    if(l>r) swap(l,r);
    l++;
    while((1<<(k+1))<=r-l+1) k++;
    return min(dp[l][k],dp[r-(1<<k)+1][k]);
}
// Current input line as read by main.
char s[N];
int a[N]; // array to be sorted: n symbols stored in 0~n-1, followed by one appended 0
// Period lengths that reach the best repetition count found so far (filled by main).
int res[N];
int main()
{
    int cnt=0;
    while(~scanf("%s",s))
    {
        if(strcmp(s,"#")==0) break;
        int n=strlen(s);
        for(int i=0;i<n;i++)
            a[i]=s[i]-'a'+1;
        a[n]=0;
        build_sa(a,n+1,28);
        getheight(a,n);
        initrmq(n);
        int ans=0,tot=0;  //ans为最多可出现次数
        for(int l=1;l<=n/2;l++)    //枚举l长度子串可以匹配
        {
            for(int i=0;i+l<n;i+=l)
            {
                if(s[i]!=s[i+l]) continue;
                int k=lcp(i,i+l);
                int t=k/l+1;
                int r=i-(l-k%l);
                //printf("%d %d %d %d\n",i,i+l,k,k/l+1);
                if(r>=0&&lcp(r,r+l)>=k)  //往前看能否还能匹配一个
                    t++;
                if(t>ans)
                {
                    ans=t;
                    tot=0;
                    res[tot++]=l;
                }
                else if(ans==t)
                    res[tot++]=l;
            }
        }
        int len=0,ss;
        for(int i=1;i<=n&&!len;i++)  //枚举sa数组找字典序最小的
        {
            for(int j=0;j<tot;j++)
            {
                int l=res[j];
                if(lcp(sa[i],sa[i]+l)>=(ans-1)*l)
                {
                    len=l;
                    ss=sa[i];
                    break;
                }
            }
        }
        s[ss+len*ans]='\0';
        printf("Case %d: %s\n",++cnt,s+ss);
    }
    return 0;
}



poj3693 Maximum repetition substring 后缀数组