I have tuned up several of my Windows server hosts by increasing the physical memory and swap space, adding the "Media Foundation" feature, etc. These servers have been running the Python tasks, which do complete, but I'm seeing about a 40% error rate that looks like a Python or programming error:
Traceback (most recent call last):
  File "C:\ProgramData\BOINC\slots\18\lib\site-packages\pytorchrl\scheme\gradients\g_worker.py", line 196, in get_data
    self.next_batch = self.batches.__next__()
StopIteration

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "run.py", line 473, in <module>
    main()
  File "run.py", line 129, in main
    learner.step()
  File "C:\ProgramData\BOINC\slots\18\lib\site-packages\pytorchrl\learner.py", line 48, in step
    info = self.update_worker.step()
  File "C:\ProgramData\BOINC\slots\18\lib\site-packages\pytorchrl\scheme\updates\u_worker.py", line 118, in step
    self.updater.step()
  File "C:\ProgramData\BOINC\slots\18\lib\site-packages\pytorchrl\scheme\updates\u_worker.py", line 259, in step
    grads = self.local_worker.step(self.decentralized_update_execution)
  File "C:\ProgramData\BOINC\slots\18\lib\site-packages\pytorchrl\scheme\gradients\g_worker.py", line 178, in step
    self.get_data()
  File "C:\ProgramData\BOINC\slots\18\lib\site-packages\pytorchrl\scheme\gradients\g_worker.py", line 211, in get_data
    self.collector.step()
  File "C:\ProgramData\BOINC\slots\18\lib\site-packages\pytorchrl\scheme\gradients\g_worker.py", line 490, in step
    rollouts = self.local_worker.collect_data(listen_to=["sync"], data_to_cpu=False)
  File "C:\ProgramData\BOINC\slots\18\lib\site-packages\pytorchrl\scheme\collection\c_worker.py", line 171, in collect_data
    train_info = self.collect_train_data(listen_to=listen_to)
  File "C:\ProgramData\BOINC\slots\18\lib\site-packages\pytorchrl\scheme\collection\c_worker.py", line 244, in collect_train_data
    obs2, reward, done2, episode_infos = self.envs_train.step(clip_act)
  File "C:\ProgramData\BOINC\slots\18\lib\site-packages\pytorchrl\agent\env\openai_baselines_dependencies\vec_envs\vec_env_base.py", line 82, in step
    self.step_async(actions)
  File "C:\ProgramData\BOINC\slots\18\lib\site-packages\pytorchrl\agent\env\vector_wrappers.py", line 63, in step_async
    self.venv.step_async(actions.squeeze(0))
  File "C:\ProgramData\BOINC\slots\18\lib\site-packages\pytorchrl\agent\env\openai_baselines_dependencies\vec_env\subproc_vec_env.py", line 79, in step_async
    remote.send(('step', action))
  File "C:\ProgramData\BOINC\slots\18\lib\multiprocessing\connection.py", line 206, in send
    self._send_bytes(_ForkingPickler.dumps(obj))
  File "C:\ProgramData\BOINC\slots\18\lib\multiprocessing\connection.py", line 280, in _send_bytes
    ov, err = _winapi.WriteFile(self._handle, buf, overlapped=True)
BrokenPipeError: [WinError 232] The pipe is being closed
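My reading of the fatal error, for whatever it's worth: the initial StopIteration just looks like the batch iterator running out and being refilled, but the BrokenPipeError at the end means the main process tried to send a ('step', action) command over a multiprocessing pipe whose other end (a vectorized-environment worker process) had already closed, i.e. the worker process had died or was shutting down. The snippet below is only my own minimal sketch of that failure mode, not the project's actual code; the dead_env_worker helper is hypothetical.

import multiprocessing as mp

def dead_env_worker(remote):
    # Stand-in for an environment worker that exits instead of looping
    # over incoming ('step', action) commands.
    remote.close()

if __name__ == "__main__":
    parent_remote, child_remote = mp.Pipe()
    proc = mp.Process(target=dead_env_worker, args=(child_remote,))
    proc.start()
    proc.join()           # worker has exited and closed its end of the pipe
    child_remote.close()  # drop the parent's handle to the child end too
    try:
        parent_remote.send(("step", None))  # same call pattern as step_async
    except BrokenPipeError as exc:
        print("worker is gone:", exc)       # [WinError 232] on Windows

If that reading is right, the interesting question is why the environment worker process dies in the first place; the send() call that shows up in the traceback is just the first place the main process notices it.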
Links to three example results:
https://www.gpugrid.net/result.php?resultid=32952892
https://www.gpugrid.net/result.php?resultid=32952524
https://www.gpugrid.net/result.php?resultid=32952028
I'm not aware of anything I can do on my end to address these; I think the GPUgrid team needs to look at them and work out what is going on.