Jump to content

Research talk:Revision scoring as a service/Work log/2015-07-14

Add topic
From Meta, a Wikimedia project coordination wiki

Tuesday, July 14, 2015

[edit]

Today, I am removing non-reverted edits from the edit quality wikilabels dataset.

Luckily, it appears as though I have already generated revert labels for the datasets. I can just post-process this dataset to get the revision ids that were not reverted.

$ (cat rev_reverted.20k_sample.enwiki.tsv | grep False | cut -f1 | sed -r "s/^/enwiki\t/";\
cat rev_reverted.20k_sample.fawiki.tsv | grep False | cut -f1 | sed -r "s/^/fawiki\t/";\
cat rev_reverted.20k_sample.ptwiki.tsv | grep False | cut -f1 | sed -r "s/^/ptwiki\t/";\
cat rev_reverted.20k_sample.trwiki.tsv | grep False | cut -f1 | sed -r "s/^/trwiki\t/") > \
rev_wiki.non_reverted.tsv

$ wc rev_wiki.non_reverted.tsv 
  74766  149532 1215235 rev_wiki.non_reverted.tsv

Looks like we can filter out about 75k our of 80k edits! That's a lot. --EpochFail (talk) 22:18, 14 July 2015 (UTC)Reply

OK. Now to actually apply the filtering.

wikilabels=> CREATE TEMPORARY TABLE rev_ids_to_filter (wiki VARCHAR(50), rev_id INT);
CREATE TABLE
wikilabels=> \copy rev_ids_to_filter FROM './rev_wiki.non_reverted.tsv'
wikilabels=> ;
wikilabels=> select count(*) FROM rev_ids_to_filter;
 count 
-------
 74766
(1 row)

wikilabels=> select * from rev_ids_to_filter limit 2;
  wiki  |  rev_id   
--------+-----------
 enwiki | 644933637
 enwiki | 629393521
(2 rows)

wikilabels=> \d workset
                                      Table "public.workset"
   Column    |            Type             |                      Modifiers                       
-------------+-----------------------------+------------------------------------------------------
 id          | integer                     | not null default nextval('workset_id_seq'::regclass)
 campaign_id | integer                     | 
 user_id     | integer                     | 
 created     | timestamp without time zone | 
 expires     | timestamp without time zone | 
Indexes:
    "workset_pkey" PRIMARY KEY, btree (id)
    "workset_user" btree (user_id)

wikilabels=> select id, wiki, name from campaign where active order by id;
 id |  wiki  |                         name                          
----+--------+-------------------------------------------------------
  4 | enwiki | Edit quality (20k random sample, 2015)
  5 | trwiki | Değişiklik kalitesi (20,000 rastgele örnekleme, 2015)
  6 | fawiki | کیفیت ویرایش (نمونه تصادفی ۲۰ هزارتایی، ۲۰۱۵)
  7 | ptwiki | Qualidade das edições (amostra de 20k revisões, 2015)
  8 | azwiki | Edit quality (20k random sample, 2015)
  9 | frwiki | Modifier la qualité (20k échantillon aléatoire, 2015)
(6 rows)

wikilabels=> INSERT INTO workset (campaign_id, user_id, created, expires) VALUES (4, 41948920, NOW(), NOW()) RETURNING id;
 id  
-----
 232
(1 row)

INSERT 0 1
wikilabels=> INSERT INTO workset (campaign_id, user_id, created, expires) VALUES (5, 41948920, NOW(), NOW()) RETURNING id;
 id  
-----
 233
(1 row)

INSERT 0 1
wikilabels=> INSERT INTO workset (campaign_id, user_id, created, expires) VALUES (7, 41948920, NOW(), NOW()) RETURNING id;
 id  
-----
 234
(1 row)

INSERT 0 1
wikilabels=> INSERT INTO workset (campaign_id, user_id, created, expires) VALUES (6, 41948920, NOW(), NOW()) RETURNING id;
 id  
-----
 235
(1 row)

INSERT 0 1
wikilabels=> \d workset_task
   Table "public.workset_task"
   Column   |  Type   | Modifiers 
------------+---------+-----------
 workset_id | integer | not null
 task_id    | integer | not null
Indexes:
    "workset_task_pkey" PRIMARY KEY, btree (workset_id, task_id)
Foreign-key constraints:
    "workset_task_task_id_fkey" FOREIGN KEY (task_id) REFERENCES task(id)

wikilabels=> SELECT 232, task.id FROM task WHERE campaign_id = 4 AND (task.data->'rev_id')::text::int IN (SELECT rev_id FROM rev_ids_to_filter WHERE wiki = 'enwiki' LIMIT 10);
 ?column? |   id   
----------+--------
      232 | 180077
      232 | 180078
      232 | 180079
      232 | 180080
      232 | 180081
      232 | 180082
      232 | 180085
      232 | 180086
      232 | 180087
      232 | 180089
(10 rows)

wikilabels=> SELECT COUNT(*) FROM label WHERE (data->>'automatic')::bool;
 count 
-------
 43434
(1 row)

wikilabels=> UPDATE label SET data = '{"damaging":false,"goodfaith":true,"unsure":true,"automatic":"advanced-rights"}'::json WHERE (data->>'automatic')::bool;
UPDATE 43434
wikilabels=> SELECT 232, task.id FROM task WHERE campaign_id = 4 AND (task.data->'rev_id')::text::int IN (SELECT rev_id FROM rev_ids_to_filter WHERE wiki = 'enwiki') AND task.id NOT IN (SELECT task_id FROM label) LIMIT 10;
 ?column? |   id   
----------+--------
      232 | 199661
      232 | 199663
      232 | 199665
      232 | 199666
      232 | 199667
      232 | 199670
      232 | 199673
      232 | 199683
      232 | 199684
      232 | 199686
(10 rows)

wikilabels=> INSERT INTO workset_task SELECT 232, task.id FROM task WHERE campaign_id = 4 AND (task.data->'rev_id')::text::int IN (SELECT rev_id FROM rev_ids_to_filter WHERE wiki = 'enwiki') AND task.id NOT IN (SELECT task_id FROM label);
INSERT 0 7374
wikilabels=> INSERT INTO workset_task SELECT 233, task.id FROM task WHERE campaign_id = 5 AND (task.data->'rev_id')::text::int IN (SELECT rev_id FROM rev_ids_to_filter WHERE wiki = 'trwiki') AND task.id NOT IN (SELECT task_id FROM label);
INSERT 0 12183
wikilabels=> INSERT INTO workset_task SELECT 235, task.id FROM task WHERE campaign_id = 6 AND (task.data->'rev_id')::text::int IN (SELECT rev_id FROM rev_ids_to_filter WHERE wiki = 'fawiki') AND task.id NOT IN (SELECT task_id FROM label);
INSERT 0 1628
wikilabels=> INSERT INTO workset_task SELECT 234, task.id FROM task WHERE campaign_id = 7 AND (task.data->'rev_id')::text::int IN (SELECT rev_id FROM rev_ids_to_filter WHERE wiki = 'ptwiki') AND task.id NOT IN (SELECT task_id FROM label);
INSERT 0 4654
wikilabels=> SELECT task_id, user_id, NOW(), '{"damaging":false,"goodfaith":true,"unsure":true,"automatic":true}'::json FROM workset INNER JOIN workset_task ON workset.id = workset_id WHERE workset.id IN (232, 233, 234, 235^C
wikilabels=> SELECT task_id, user_id, NOW(), '{"damaging":false,"goodfaith":true,"unsure":true,"automatic":"not-reverted"}'::json FROM workset INNER JOIN workset_task ON workset.id = workset_id WHERE workset.id IN (232, 233, 234, 235) LIMIT 10;
wikilabels=> INSERT INTO label SELECT task_id, user_id, NOW(), '{"damaging":false,"goodfaith":true,"unsure":true,"automatic":"not-reverted"}'::json FROM workset INNER JOIN workset_task ON workset.id = workset_id WHERE workset.id IN (232, 233, 234, 235);
INSERT 0 25839

Woot! Now time to tell the labell\ers that we've minimized their work load! --EpochFail (talk) 00:11, 15 July 2015 (UTC)Reply