Research talk:Revision scoring as a service/Work log/2015-07-14
Add topicAppearance
Latest comment: 9 years ago by EpochFail in topic Tuesday, July 14, 2015
Tuesday, July 14, 2015
[edit]Today, I am removing non-reverted edits from the edit quality wikilabels dataset.
Luckily, it appears as though I have already generated revert labels for the datasets. I can just post-process this dataset to get the revision ids that were not reverted.
$ (cat rev_reverted.20k_sample.enwiki.tsv | grep False | cut -f1 | sed -r "s/^/enwiki\t/";\ cat rev_reverted.20k_sample.fawiki.tsv | grep False | cut -f1 | sed -r "s/^/fawiki\t/";\ cat rev_reverted.20k_sample.ptwiki.tsv | grep False | cut -f1 | sed -r "s/^/ptwiki\t/";\ cat rev_reverted.20k_sample.trwiki.tsv | grep False | cut -f1 | sed -r "s/^/trwiki\t/") > \ rev_wiki.non_reverted.tsv $ wc rev_wiki.non_reverted.tsv 74766 149532 1215235 rev_wiki.non_reverted.tsv
Looks like we can filter out about 75k our of 80k edits! That's a lot. --EpochFail (talk) 22:18, 14 July 2015 (UTC)
OK. Now to actually apply the filtering.
wikilabels=> CREATE TEMPORARY TABLE rev_ids_to_filter (wiki VARCHAR(50), rev_id INT);
CREATE TABLE
wikilabels=> \copy rev_ids_to_filter FROM './rev_wiki.non_reverted.tsv'
wikilabels=> ;
wikilabels=> select count(*) FROM rev_ids_to_filter;
count
-------
74766
(1 row)
wikilabels=> select * from rev_ids_to_filter limit 2;
wiki | rev_id
--------+-----------
enwiki | 644933637
enwiki | 629393521
(2 rows)
wikilabels=> \d workset
Table "public.workset"
Column | Type | Modifiers
-------------+-----------------------------+------------------------------------------------------
id | integer | not null default nextval('workset_id_seq'::regclass)
campaign_id | integer |
user_id | integer |
created | timestamp without time zone |
expires | timestamp without time zone |
Indexes:
"workset_pkey" PRIMARY KEY, btree (id)
"workset_user" btree (user_id)
wikilabels=> select id, wiki, name from campaign where active order by id;
id | wiki | name
----+--------+-------------------------------------------------------
4 | enwiki | Edit quality (20k random sample, 2015)
5 | trwiki | Değişiklik kalitesi (20,000 rastgele örnekleme, 2015)
6 | fawiki | کیفیت ویرایش (نمونه تصادفی ۲۰ هزارتایی، ۲۰۱۵)
7 | ptwiki | Qualidade das edições (amostra de 20k revisões, 2015)
8 | azwiki | Edit quality (20k random sample, 2015)
9 | frwiki | Modifier la qualité (20k échantillon aléatoire, 2015)
(6 rows)
wikilabels=> INSERT INTO workset (campaign_id, user_id, created, expires) VALUES (4, 41948920, NOW(), NOW()) RETURNING id;
id
-----
232
(1 row)
INSERT 0 1
wikilabels=> INSERT INTO workset (campaign_id, user_id, created, expires) VALUES (5, 41948920, NOW(), NOW()) RETURNING id;
id
-----
233
(1 row)
INSERT 0 1
wikilabels=> INSERT INTO workset (campaign_id, user_id, created, expires) VALUES (7, 41948920, NOW(), NOW()) RETURNING id;
id
-----
234
(1 row)
INSERT 0 1
wikilabels=> INSERT INTO workset (campaign_id, user_id, created, expires) VALUES (6, 41948920, NOW(), NOW()) RETURNING id;
id
-----
235
(1 row)
INSERT 0 1
wikilabels=> \d workset_task
Table "public.workset_task"
Column | Type | Modifiers
------------+---------+-----------
workset_id | integer | not null
task_id | integer | not null
Indexes:
"workset_task_pkey" PRIMARY KEY, btree (workset_id, task_id)
Foreign-key constraints:
"workset_task_task_id_fkey" FOREIGN KEY (task_id) REFERENCES task(id)
wikilabels=> SELECT 232, task.id FROM task WHERE campaign_id = 4 AND (task.data->'rev_id')::text::int IN (SELECT rev_id FROM rev_ids_to_filter WHERE wiki = 'enwiki' LIMIT 10);
?column? | id
----------+--------
232 | 180077
232 | 180078
232 | 180079
232 | 180080
232 | 180081
232 | 180082
232 | 180085
232 | 180086
232 | 180087
232 | 180089
(10 rows)
wikilabels=> SELECT COUNT(*) FROM label WHERE (data->>'automatic')::bool;
count
-------
43434
(1 row)
wikilabels=> UPDATE label SET data = '{"damaging":false,"goodfaith":true,"unsure":true,"automatic":"advanced-rights"}'::json WHERE (data->>'automatic')::bool;
UPDATE 43434
wikilabels=> SELECT 232, task.id FROM task WHERE campaign_id = 4 AND (task.data->'rev_id')::text::int IN (SELECT rev_id FROM rev_ids_to_filter WHERE wiki = 'enwiki') AND task.id NOT IN (SELECT task_id FROM label) LIMIT 10;
?column? | id
----------+--------
232 | 199661
232 | 199663
232 | 199665
232 | 199666
232 | 199667
232 | 199670
232 | 199673
232 | 199683
232 | 199684
232 | 199686
(10 rows)
wikilabels=> INSERT INTO workset_task SELECT 232, task.id FROM task WHERE campaign_id = 4 AND (task.data->'rev_id')::text::int IN (SELECT rev_id FROM rev_ids_to_filter WHERE wiki = 'enwiki') AND task.id NOT IN (SELECT task_id FROM label);
INSERT 0 7374
wikilabels=> INSERT INTO workset_task SELECT 233, task.id FROM task WHERE campaign_id = 5 AND (task.data->'rev_id')::text::int IN (SELECT rev_id FROM rev_ids_to_filter WHERE wiki = 'trwiki') AND task.id NOT IN (SELECT task_id FROM label);
INSERT 0 12183
wikilabels=> INSERT INTO workset_task SELECT 235, task.id FROM task WHERE campaign_id = 6 AND (task.data->'rev_id')::text::int IN (SELECT rev_id FROM rev_ids_to_filter WHERE wiki = 'fawiki') AND task.id NOT IN (SELECT task_id FROM label);
INSERT 0 1628
wikilabels=> INSERT INTO workset_task SELECT 234, task.id FROM task WHERE campaign_id = 7 AND (task.data->'rev_id')::text::int IN (SELECT rev_id FROM rev_ids_to_filter WHERE wiki = 'ptwiki') AND task.id NOT IN (SELECT task_id FROM label);
INSERT 0 4654
wikilabels=> SELECT task_id, user_id, NOW(), '{"damaging":false,"goodfaith":true,"unsure":true,"automatic":true}'::json FROM workset INNER JOIN workset_task ON workset.id = workset_id WHERE workset.id IN (232, 233, 234, 235^C
wikilabels=> SELECT task_id, user_id, NOW(), '{"damaging":false,"goodfaith":true,"unsure":true,"automatic":"not-reverted"}'::json FROM workset INNER JOIN workset_task ON workset.id = workset_id WHERE workset.id IN (232, 233, 234, 235) LIMIT 10;
wikilabels=> INSERT INTO label SELECT task_id, user_id, NOW(), '{"damaging":false,"goodfaith":true,"unsure":true,"automatic":"not-reverted"}'::json FROM workset INNER JOIN workset_task ON workset.id = workset_id WHERE workset.id IN (232, 233, 234, 235);
INSERT 0 25839
Woot! Now time to tell the labell\ers that we've minimized their work load! --EpochFail (talk) 00:11, 15 July 2015 (UTC)