mirror of
				https://github.com/Garmelon/PFERD.git
				synced 2025-11-04 06:32:52 +01:00 
			
		
		
		
	Add regex option to config and CLI parser
This commit is contained in:
		@@ -138,11 +138,16 @@ crawler simulate a slower, network-based crawler.
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
### The `kit-ipd` crawler
 | 
					### The `kit-ipd` crawler
 | 
				
			||||||
 | 
					
 | 
				
			||||||
This crawler crawls a KIT ipd page by url. The root page can be crawled from
 | 
					This crawler crawls a KIT-IPD page by url. The root page can be crawled from
 | 
				
			||||||
outside the KIT network so you will be informed about any new/deleted files,
 | 
					outside the KIT network so you will be informed about any new/deleted files,
 | 
				
			||||||
but downloading files requires you to be within. Adding a short delay between
 | 
					but downloading files requires you to be within. Adding a short delay between
 | 
				
			||||||
requests is likely a good idea.
 | 
					requests is likely a good idea.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					- `target`: URL to a KIT-IPD page
 | 
				
			||||||
 | 
					- `link_regex`: A regex that is matched against the `href` part of links. If it
 | 
				
			||||||
 | 
					  matches, the given link is downloaded as a file. This is used to extract
 | 
				
			||||||
 | 
					  files from KIT-IPD pages. (Default: `^.*/[^/]*\.(?:pdf|zip|c|java)$`)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
### The `kit-ilias-web` crawler
 | 
					### The `kit-ilias-web` crawler
 | 
				
			||||||
 | 
					
 | 
				
			||||||
This crawler crawls the KIT ILIAS instance.
 | 
					This crawler crawls the KIT ILIAS instance.
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										3
									
								
								LICENSE
									
									
									
									
									
								
							
							
						
						
									
										3
									
								
								LICENSE
									
									
									
									
									
								
							@@ -1,4 +1,5 @@
 | 
				
			|||||||
Copyright 2019-2021 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, TheChristophe, Scriptim, thelukasprobst
 | 
					Copyright 2019-2021 Garmelon, I-Al-Istannen, danstooamerican, pavelzw,
 | 
				
			||||||
 | 
					                    TheChristophe, Scriptim, thelukasprobst, Toorero
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Permission is hereby granted, free of charge, to any person obtaining a copy of
 | 
					Permission is hereby granted, free of charge, to any person obtaining a copy of
 | 
				
			||||||
this software and associated documentation files (the "Software"), to deal in
 | 
					this software and associated documentation files (the "Software"), to deal in
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -14,6 +14,12 @@ GROUP = SUBPARSER.add_argument_group(
 | 
				
			|||||||
    title="kit ipd crawler arguments",
 | 
					    title="kit ipd crawler arguments",
 | 
				
			||||||
    description="arguments for the 'kit-ipd' crawler",
 | 
					    description="arguments for the 'kit-ipd' crawler",
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
 | 
					GROUP.add_argument(
 | 
				
			||||||
 | 
					    "--link-regex",
 | 
				
			||||||
 | 
					    type=str,
 | 
				
			||||||
 | 
					    metavar="REGEX",
 | 
				
			||||||
 | 
					    help="href-matching regex to identify downloadable files"
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
GROUP.add_argument(
 | 
					GROUP.add_argument(
 | 
				
			||||||
    "target",
 | 
					    "target",
 | 
				
			||||||
    type=str,
 | 
					    type=str,
 | 
				
			||||||
@@ -41,6 +47,8 @@ def load(
 | 
				
			|||||||
    section["type"] = "kit-ipd"
 | 
					    section["type"] = "kit-ipd"
 | 
				
			||||||
    section["target"] = str(args.target)
 | 
					    section["target"] = str(args.target)
 | 
				
			||||||
    section["output_dir"] = str(args.output)
 | 
					    section["output_dir"] = str(args.output)
 | 
				
			||||||
 | 
					    if args.link_regex:
 | 
				
			||||||
 | 
					        section["link_regex"] = str(args.link_regex)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
SUBPARSER.set_defaults(command=load)
 | 
					SUBPARSER.set_defaults(command=load)
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user