# vectorize.py (845 B)

  1. import argparse
  2. from utils.data import traverse
  3. from utils.vector import save_vectors
  4. def parse_arguments():
  5. parser = argparse.ArgumentParser()
  6. parser.add_argument('--workspace', type=str, help="directory of the workspace to be vectorized", default='.')
  7. parser.add_argument('--lines_per_chunk', type=int, help="chunk lines when splitting", default=40)
  8. parser.add_argument('--lines_overlap', type=int, help="chunk lines overlap when splitting", default=15)
  9. parser.add_argument("--max_chars", type=int, help="maximum number of characters in a chunk", default=1500)
  10. parser.add_argument('--output_path', type=str, help="path to save the vectors", default='vectors')
  11. return parser.parse_args()
  12. if __name__ == '__main__':
  13. args = parse_arguments()
  14. files = traverse(args.workspace)
  15. save_vectors(files, args)