Jump to content

Gfdl-wikititle

From Meta, a Wikimedia project coordination wiki

gfdl-wikititle is a utility written in C by Jeffrey Vernon Merkey of the Wolf Mountain Group. It inserts interwiki links into the articles contained in an XML dump provided by the Wikimedia Foundation. It can also strip out and number all article titles, and fix bad titles that may cause importDump.php to crash during database import of the standard Wikimedia XML dumps published by the Foundation.

The program source code is released under the GNU General Public License (GPL), or any later version. To build the program, copy each source listing below into a file with the name shown above it, then compile the program (for example, by running make).

Because the program is written in C and compiled to machine code rather than run as an interpreted script, it runs many orders of magnitude faster than script-based programs, such as those written in PHP or Perl. gfdl-wikititle can process a 10 GB uncompressed XML dump in a matter of minutes rather than the several hours that are more typical of PHP programs.

This program also allows you to insert links into your dump that point back to the English Wikipedia, or to whatever URL you wish to link each article to. The GFDL requires that you attribute authorship of the articles contained in the dumps. The method recommended by the Wikimedia Foundation and the Wikipedia community is to post the text of the GFDL on your MediaWiki site and then link each article back to its parent article on Wikipedia. This is most easily accomplished by inserting an interwiki language link into each article.


Source Code

[edit]

Makefile

# Build settings. -Wno-pointer-sign silences the warnings caused by mixing
# char and unsigned char pointers in gfdl-wikititle.c.
CFLAGS = -Wno-pointer-sign -O2 -g
CC = gcc

# These targets name actions, not files: declare them phony so that a stray
# file called "all", "clean" or "install" cannot stop them from running.
.PHONY: all clean install

all:  gfdl-wikititle

# Rebuild the program whenever the source or the platform selector changes.
gfdl-wikititle: gfdl-wikititle.c platform.h
	$(CC) $(CFLAGS) gfdl-wikititle.c -o gfdl-wikititle

# Remove build products.
clean:
	rm -f *.o gfdl-wikititle 

# Build if necessary, then copy the executable into /usr/local/bin.
install: all
	install -m 755 gfdl-wikititle /usr/local/bin

platform.h

/* Platform selector: gfdl-wikititle.c chooses its set of #include lines with
   #ifdef LINUX / #ifdef WINDOWS. */
#define LINUX

gfdl-wikititle.c

#include "platform.h"

#ifdef WINDOWS

#define strncasecmp strnicmp

#include "windows.h"
#include "winioctl.h"
#include "winuser.h"
#include "stdarg.h"
typedef UCHAR BYTE;
typedef USHORT WORD;
#include "stdio.h"
#include "stdlib.h"
#include "ctype.h"
#include "conio.h"

#endif

#ifdef LINUX

#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#endif

/* Compile-time switches: when non-zero, main() special-cases pages whose
   <title> line contains "MediaWiki:" (STRIP_META) or "Template:"
   (STRIP_TEMPLATES). */
#define STRIP_META      1
#define STRIP_TEMPLATES 1

/* Running number of the titles seen so far; main() increments it once per
   non-empty title. */
unsigned long count = 0UL;

/* Language code used for the inserted links unless -code overrides it. */
unsigned char language_default[] = "en";

/* Size in bytes of the line buffer and of the title buffer. */
#define BUFFER_SIZE 65536

/* Titles longer than this many bytes are reported by -badtitlelen and
   replaced by -fix. */
#define MAX_TITLE_LEN 256

/* Print the command line help screen on stdout. */
static void usage(void)
{
    printf("USAGE:  gfdl-wikititle [-hsu|num] < src.xml > target.xml\n");
    printf("            -h              - this help screen\n");
    printf("            -code <num>     - language code (en,de,etc)\n");
    printf("            -s string       - custom insertion string\n");
    printf("            -u string       - custom url string\n");
    printf("            -strip          - strip/number title names\n");
    printf("            -badtitlelen    - show titles > 256 bytes,\n");
    printf("            -badtitlechar   - show titles that contain\n");
    printf("                              multiple ':' and '/' chars\n");
    printf("            -fix            - fix broken titles\n");
    printf("            src.xml         - source xml file\n");
    printf("            target.xml      - output xml file\n");
}

/* Return the argument that follows option argv[*i] and advance *i onto it.
   When the option is the last argument, print "<what> expected" on stderr
   and exit with status 1. */
static char *option_value(int argc, char *argv[], int *i, const char *what)
{
    if (*i + 1 >= argc)
    {
       fprintf(stderr, "gfdl-wikititle: %s expected\n", what);
       exit(1);
    }
    *i += 1;
    return argv[*i];
}

/* Count the occurrences of the character ch in the string str. */
static int count_char(const char *str, char ch)
{
    int n = 0;

    for (; *str; str++)
    {
       if (*str == ch)
          n++;
    }
    return n;
}

/*
 * Filter an XML dump from stdin to stdout.
 *
 * Default mode: every input line is copied to the output, and for each page
 * with a non-empty <title>...</title> on one line a link built from that
 * title is written in front of the first following "</text>".  The link is
 * "\n[[code:title]]" by default, "\n[<url><title>]" with -u and
 * "\n<string><title>" with -s (-u wins when both are given).  With -fix a
 * title longer than MAX_TITLE_LEN bytes is replaced by
 * "gfdl-wikititle-<number>".
 *
 * Strip mode (-strip, -badtitlechar, -badtitlelen): nothing is copied; only
 * "<number>:<title>" lines are printed, for all titles or for the titles
 * selected by the -badtitle* options.
 *
 * Unknown arguments are ignored.  Returns 0 on success; exits with status 1
 * when an option value is missing or a work buffer cannot be allocated.
 */
int main(int argc, char *argv[])
{
    char *s, *title, *title_p, *buffer;
    int inpage = 0, strip = 0, fix = 0, i;
    int badtitlelen = 0, badtitlechar = 0;
    const char *code = (const char *)language_default;
    const char *link_text = NULL;   /* value of -s, if given */
    const char *link_url = NULL;    /* value of -u, if given */

    for (i = 1; i < argc; i++)
    {
       if (!strcmp(argv[i], "-h"))
       {
          usage();
          exit(0);
       }
       else if (!strcmp(argv[i], "-fix"))
          fix = 1;
       else if (!strcmp(argv[i], "-strip"))
          strip = 1;
       else if (!strcmp(argv[i], "-badtitlechar"))
       {
          strip = 1;
          badtitlechar = 1;
       }
       else if (!strcmp(argv[i], "-badtitlelen"))
       {
          strip = 1;
          badtitlelen = 1;
       }
       /* -s, -u and -code each consume the argument that follows them, so
          the options may be given in any order. */
       else if (!strcmp(argv[i], "-s"))
          link_text = option_value(argc, argv, &i, "insertion string");
       else if (!strcmp(argv[i], "-u"))
          link_url = option_value(argc, argv, &i, "url string");
       else if (!strcmp(argv[i], "-code"))
          code = option_value(argc, argv, &i, "language code");
    }

    buffer = malloc(BUFFER_SIZE);
    if (!buffer)
    {
       fprintf(stderr, "gfdl-wikititle:  could not allocate buffer workspace\n");
       exit(1);
    }
    *buffer = '\0';

    title = malloc(BUFFER_SIZE);
    if (!title)
    {
       fprintf(stderr, "gfdl-wikititle:  could not allocate namespace\n");
       exit(1);
    }
    *title = '\0';

    while ((s = fgets(buffer, BUFFER_SIZE, stdin)))
    {
       if (strstr(s, "<page>"))
       {
          inpage++;
          *title = '\0';
          if (!strip)
             printf("%s", s);
          continue;
       }

       if (strstr(s, "</page>"))
       {
          if (inpage)
             inpage--;
          *title = '\0';
          if (!strip)
             printf("%s", s);
          continue;
       }

       title_p = strstr(s, "<title>");
       if (inpage && title_p)
       {
          char *ts = title_p + 7; /* length of "<title>" */
          char *tp = strstr(ts, "</title>");

          /* only a non-empty title closed on the same line is used */
          if (tp && tp != ts)
          {
             size_t len = (size_t)(tp - ts);

             /* a line holds at most BUFFER_SIZE - 1 bytes, so the title
                plus its terminator always fits the title buffer */
             memcpy(title, ts, len);
             title[len] = '\0';
             count++;

             if (strip)
             {
                if (badtitlechar || badtitlelen)
                {
                   /* NOTE(review): the help text mentions ':' and '/', but
                      only ':' is counted here - confirm which is intended. */
                   if (badtitlechar && count_char(title, ':') > 1)
                      printf("%lu:%s\n", count, title);

                   if (badtitlelen && len > MAX_TITLE_LEN)
                      printf("%lu:%s\n", count, title);
                }
                else
                   printf("%lu:%s\n", count, title);
                continue;
             }

/* do not link meta or mediawiki internal links. */
#if STRIP_META
             if (strstr(s, "MediaWiki:"))
             {
                /* forget the title so no link is written at </text> */
                *title = '\0';
                fputs(s, stdout);
                continue;
             }
#endif
#if STRIP_TEMPLATES
             if (strstr(s, "Template:"))
             {
                /* forget the title so no link is written at </text> */
                *title = '\0';
                fputs(s, stdout);
                continue;
             }
#endif
             if (fix && len > MAX_TITLE_LEN)
             {
                /* the title number replaces the title, both in the dump
                   and in the link written later for this page */
                snprintf(title, BUFFER_SIZE, "%lu", count);
                printf("<title>gfdl-wikititle-%s</title>\n", title);
                continue;
             }
          }
       }

       /* strip mode prints title reports only */
       if (strip)
          continue;

       if (inpage && *title)
       {
          char *end_text = strstr(s, "</text>");

          if (end_text)
          {
             /* copy what precedes "</text>", then insert the link */
             fwrite(s, 1, (size_t)(end_text - s), stdout);
             if (link_url)
                printf("\n[%s%s]", link_url, title);
             else if (link_text)
                printf("\n%s%s", link_text, title);
             else
                printf("\n[[%s:%s]]", code, title);
             *title = '\0'; /* one link per page */
             s = end_text;
          }
       }
       fputs(s, stdout);
    }
    free(title);
    free(buffer);
    return 0;
}