88
99def main (
1010 output_path ,
11- mongo_config
11+ mongo_config ,
12+ clid_start ,
13+ clid_end ,
14+ batch_size
1215):
1316 with open (mongo_config ) as r :
1417 config = json .load (r )
@@ -18,19 +21,36 @@ def main(
1821 clusters_collection_name = config ["clusters_collection_name" ]
1922 collection = client [database_name ][clusters_collection_name ]
2023
21- clusters = list (collection .find ({}))
22- clusters .sort (key = lambda x : x ["annotation_doc" ]["pub_time" ])
24+ if not clid_start :
25+ first_cluster = collection .find_one (sort = [("clid" , 1 )])
26+ clid_start = first_cluster ["clid" ]
27+ if not clid_end :
28+ last_cluster = collection .find_one (sort = [("clid" , - 1 )])
29+ clid_end = last_cluster ["clid" ] + 1
30+ print (f"Start clid: { clid_start } " )
31+ print (f"End clid: { clid_end } " )
32+
33+ current_clid_start = clid_start
2334 with open (output_path , "w" ) as w :
24- for cluster in clusters :
25- cluster .pop ("_id" )
26- cluster ["annotation_doc" ].pop ("embedding" , None )
27- cluster ["annotation_doc" ].pop ("embedded_images" , None )
28- w .write (json .dumps (cluster , ensure_ascii = False ) + "\n " )
35+ while current_clid_start < clid_end :
36+ print (clid_end - current_clid_start )
37+ current_clid_end = current_clid_start + batch_size
38+ clusters = list (collection .find ({"clid" : {"$gte" : current_clid_start , "$lt" : current_clid_end }}))
39+ clusters .sort (key = lambda x : x ["annotation_doc" ]["pub_time" ])
40+ for cluster in clusters :
41+ cluster .pop ("_id" )
42+ cluster ["annotation_doc" ].pop ("embedding" , None )
43+ cluster ["annotation_doc" ].pop ("embedded_images" , None )
44+ w .write (json .dumps (cluster , ensure_ascii = False ) + "\n " )
45+ current_clid_start = current_clid_end
2946
3047
def positive_int(value):
    """Parse a command-line value as an integer that is at least 1.

    Used as the argparse ``type`` for ``--batch-size``: ``main`` advances its
    clid window by ``batch_size`` on every loop iteration, so a zero or
    negative value would never reach the end of the range and the export loop
    would not terminate.

    Args:
        value: Raw string taken from the command line.

    Returns:
        The parsed integer.

    Raises:
        argparse.ArgumentTypeError: If the parsed integer is smaller than 1.
        ValueError: If ``value`` is not an integer literal (argparse reports
            this as a normal usage error).
    """
    parsed = int(value)
    if parsed < 1:
        raise argparse.ArgumentTypeError(
            f"must be a positive integer, got {value!r}"
        )
    return parsed


if __name__ == "__main__":
    # Command-line entry point: every flag maps one-to-one onto a keyword
    # argument of main() (dashes become underscores via argparse).
    parser = argparse.ArgumentParser()
    parser.add_argument("--output-path", type=str, default="data/clusters.jsonl")
    parser.add_argument("--mongo-config", type=str, default="configs/mongo_config.json")
    # When omitted, main() derives the bound from the collection itself.
    parser.add_argument("--clid-start", type=int, default=None)
    parser.add_argument("--clid-end", type=int, default=None)
    # Rejected at parse time when < 1 so the batching loop always advances.
    parser.add_argument("--batch-size", type=positive_int, default=1000)
    args = parser.parse_args()
    main(**vars(args))
0 commit comments